Skip to content

Commit cfcda31

Browse files
committed
merge
Signed-off-by: Nicholas Gates <nick@nickgates.com>
2 parents 49aaada + ff21366 commit cfcda31

179 files changed

Lines changed: 3431 additions & 1477 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/bench-pr.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,8 @@ jobs:
141141
"id": "clickbench-nvme",
142142
"subcommand": "clickbench",
143143
"name": "Clickbench on NVME",
144-
"targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb"
144+
"targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb",
145+
"extra_data_formats": "vortex-compact"
145146
},
146147
{
147148
"id": "tpch-nvme",

.github/workflows/nightly-bench.yml

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -23,33 +23,8 @@ jobs:
2323
with:
2424
mode: "develop"
2525
machine_type: ${{ matrix.machine_type.instance_name }}
26-
# datafusion:vortex uses a lot of memory
2726
benchmark_matrix: |
2827
[
29-
{
30-
"id": "clickbench-nvme",
31-
"subcommand": "clickbench",
32-
"name": "Clickbench on NVME",
33-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
34-
"build_lance": true
35-
},
36-
{
37-
"id": "tpch-nvme",
38-
"subcommand": "tpch",
39-
"name": "TPC-H on NVME",
40-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:duckdb",
41-
"scale_factor": "10.0",
42-
"build_lance": true
43-
},
44-
{
45-
"id": "tpch-s3",
46-
"subcommand": "tpch",
47-
"name": "TPC-H on S3",
48-
"local_dir": "vortex-bench/data/tpch/10.0",
49-
"remote_storage": "s3://vortex-ci-benchmark-datasets/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
50-
"targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex",
51-
"scale_factor": "10.0"
52-
},
5328
{
5429
"id": "tpch-nvme",
5530
"subcommand": "tpch",
@@ -74,6 +49,3 @@ jobs:
7449
machine_type:
7550
- id: x86
7651
instance_name: c6id.8xlarge
77-
# TODO(joe): support other arch
78-
# - id: arm64
79-
# instance_name: c6gd.8xlarge

.github/workflows/sql-benchmarks.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,11 @@ jobs:
154154
# Extract all unique formats from targets (e.g., "datafusion:parquet,duckdb:vortex" -> "parquet,vortex")
155155
all_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | sed 's/^[^:]*://' | sort -u | tr '\n' ',' | sed 's/,$//')
156156
157+
# Append extra data formats if specified (for file size tracking without benchmarking)
158+
if [ -n "${{ matrix.extra_data_formats }}" ]; then
159+
all_formats="$all_formats,${{ matrix.extra_data_formats }}"
160+
fi
161+
157162
# Build options string if scale_factor is set
158163
opts=""
159164
if [ -n "${{ matrix.scale_factor }}" ]; then

AGENTS.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Vortex
2+
3+
## Development Guidelines
4+
5+
* project is a monorepo Rust workspace, java bindings in `/java`, python bindings in `/vortex-python`
6+
* run `cargo build -p <crate-name>` to build a specific crate
7+
* use `cargo clippy --all-targets --all-features` to make sure a project is free of lint issues. Please do this every
8+
time you reach a stopping point or think you've finished work.
9+
* run `cargo +nightly fmt --all` to format Rust source files. Please do this every time you reach a stopping point or
10+
think you've finished work.
11+
* run `cargo xtask public-api` to re-generate the public API lock files. Please do this every time you reach a stopping
12+
point or think you've finished work.
13+
* you can try running
14+
`cargo fix --lib --allow-dirty --allow-staged && cargo clippy --fix --lib --allow-dirty --allow-staged` to
15+
automatically fix many minor errors.
16+
17+
## Architecture
18+
19+
* `vortex-buffer` defines zero-copy aligned `Buffer<T>` and `BufferMut<T>` that are guaranteed
20+
to be aligned to `T` (or whatever requested runtime alignment).
21+
* `vortex-array/src/dtype` contains the basic `DType` logical type enum that is the basis of the Vortex
22+
type system
23+
* `vortex-array` contains the basic `Array` trait, as well as several encodings which impl
24+
that trait for each encoding. It includes most of the Apache Arrow encodings.
25+
* More exotic compressed encodings live in the crates inside of `/encodings/*`
26+
* File IO is defined in `vortex-file`. It uses the concept of a `LayoutReader` defined
27+
in `vortex-layout` crate.
28+
* `/vortex-python` contains the python bindings. rst flavored docs for the project are in `/docs`
29+
30+
## Code Style
31+
32+
* Prefer `impl AsRef<T>` to `&T` for public interfaces where possible, e.g. `impl AsRef<Path>`
33+
* Avoid usage of unsafe where not necessary, use zero-cost safe abstractions wherever possible,
34+
or cheap non-zero-cost abstractions.
35+
* Every new public API definition must have a doc comment. Examples are nice to have but not
36+
strictly required.
37+
* Use `vortex_err!` to create a `VortexError` with a format string and `vortex_bail!` to do the same but immediately
38+
return it as a `VortexResult<T>` to the surrounding context.
39+
* When writing tests, strongly consider using `rstest` cases to parameterize repetitive test logic.
40+
* If you want to add a large number of tests to an existing file module called `foo.rs`, and if you think doing so
41+
would
42+
be too many to inline in a `tests` submodule within `foo.rs`, then first promote `foo` to a directory module. You can
43+
do
44+
this by running `mkdir foo && mv foo.rs foo/mod.rs`. Then, you can create a test file `foo/tests.rs` that you include
45+
in `foo/mod.rs` with the appropriate test config flag.
46+
* If you encounter clippy errors in tests that should only pertain to production code (e.g., prohibiting panic/unwrap,
47+
possible numerical truncation, etc.), then consider allowing those lints at the test module level.
48+
* Prefer naming test modules `tests`, not `test`.
49+
* Prefer having tests return `VortexResult<()>` and use `?` over `unwrap`.
50+
* All imports must be at the top of the module, never inside functions. The only exception is `#[cfg(test)]` blocks,
51+
where imports should be at the top of the test module. Function-scoped imports are only acceptable when (a) required,
52+
or (b) it would be exceptionally verbose otherwise, such as a match statement where left and right sides have similar
53+
names.
54+
* Imports should be preferred over qualified identifiers.
55+
* Only write comments that explain non-obvious logic or important context. Avoid commenting simple or self-explanatory
56+
code.
57+
* Use `assert_arrays_eq!` macro for comparing arrays in tests instead of element-by-element comparison.
58+
* Keep tests concise and to the point - avoid unnecessary setup or verbose assertions.
59+
* Run tests for a specific crate with `cargo test -p <crate-name>` (e.g., `cargo test -p vortex-array`).
60+
61+
## Other
62+
63+
* When summarizing your work, please produce summaries in valid Markdown that can be easily copied/pasted to GitHub.
64+
65+
## Commits
66+
67+
* All commits must be signed off by the committers in the form `Signed-off-by: "COMMITTER" <COMMITTER_EMAIL>`.

CLAUDE.md

Lines changed: 0 additions & 67 deletions
This file was deleted.

CLAUDE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
AGENTS.md

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ needless_range_loop = "allow"
349349
or_fun_call = "deny"
350350
panic = "deny"
351351
# panic_in_result_fn = "deny" -- we cannot disable this for tests to use assertions
352+
clone_on_ref_ptr = "deny"
352353
redundant_clone = "deny"
353354
same_name_method = "deny"
354355
tests_outside_test_module = "deny"

REUSE.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ SPDX-License-Identifier = "CC-BY-4.0"
2626
# Utility code.
2727

2828
[[annotations]]
29-
path = ["**/README.md", "CLAUDE.md", "CONTRIBUTING.md", "STYLE.md", "tsc/**.md"]
29+
path = ["**/README.md", "AGENTS.md", "CLAUDE.md", "CONTRIBUTING.md", "STYLE.md", "tsc/**.md"]
3030
SPDX-FileCopyrightText = "Copyright the Vortex contributors"
3131
SPDX-License-Identifier = "CC-BY-4.0"
3232

benchmarks/compress-bench/src/parquet.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ impl Compressor for ParquetCompressor {
5353
// Read the input parquet file
5454
let file = File::open(parquet_path)?;
5555
let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
56-
let schema = builder.schema().clone();
56+
let schema = Arc::clone(builder.schema());
5757
let reader = builder.build()?;
5858
let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>()?;
5959

@@ -69,7 +69,7 @@ impl Compressor for ParquetCompressor {
6969
// First compress to get the bytes we'll decompress
7070
let file = File::open(parquet_path)?;
7171
let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
72-
let schema = builder.schema().clone();
72+
let schema = Arc::clone(builder.schema());
7373
let reader = builder.build()?;
7474
let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>()?;
7575

benchmarks/datafusion-bench/src/lib.rs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ pub fn make_object_store(
8181
.with_bucket_name(bucket_name)
8282
.build()?,
8383
);
84-
session
85-
.register_object_store(&Url::parse(&format!("s3://{bucket_name}/"))?, s3.clone());
84+
session.register_object_store(
85+
&Url::parse(&format!("s3://{bucket_name}/"))?,
86+
Arc::<object_store::aws::AmazonS3>::clone(&s3),
87+
);
8688
Ok(s3)
8789
}
8890
"gs" => {
@@ -92,13 +94,16 @@ pub fn make_object_store(
9294
.with_bucket_name(bucket_name)
9395
.build()?,
9496
);
95-
session
96-
.register_object_store(&Url::parse(&format!("gs://{bucket_name}/"))?, gcs.clone());
97+
session.register_object_store(
98+
&Url::parse(&format!("gs://{bucket_name}/"))?,
99+
Arc::<object_store::gcp::GoogleCloudStorage>::clone(&gcs),
100+
);
97101
Ok(gcs)
98102
}
99103
_ => {
100104
let fs = Arc::new(LocalFileSystem::default());
101-
session.register_object_store(&Url::parse("file:/")?, fs.clone());
105+
session
106+
.register_object_store(&Url::parse("file:/")?, Arc::<LocalFileSystem>::clone(&fs));
102107
Ok(fs)
103108
}
104109
}

0 commit comments

Comments
 (0)