Skip to content

Commit 6ce0cca

Browse files
committed
Updated datafusion-vortex submission with up-to-date versions and new dedicated tool
1 parent 863d0ba commit 6ce0cca

28 files changed

Lines changed: 737 additions & 148 deletions
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# DataFusion + Vortex
2+
3+
This entry benchmarks a partitioned Vortex dataset, converted one-for-one from the 100 ClickBench Parquet files and queried with [`vortex-datafusion-cli`].
4+
5+
[`vortex-datafusion-cli`]: https://github.com/vortex-data/vortex-datafusion-cli
6+
7+
## Cookbook: Generate benchmark results
8+
9+
Follow the same EC2 setup used by [datafusion-partitioned](../datafusion-partitioned/README.md), then run:
10+
11+
```bash
12+
cd ClickBench/datafusion-vortex-partitioned
13+
bash benchmark.sh
14+
```
15+
16+
The shared benchmark harness builds `vortex-datafusion-cli`, downloads the partitioned Parquet files, converts each `partitioned/hits_N.parquet` file into exactly one `vortex/hits_N.vortex` file, and runs the query set.
17+
18+
The `install` script checks out `vortex-datafusion-cli` tag `0.70.0-53.1.0`. CLI tags use `<vortex-version>-<df-version>`, where the first component is the `vortex-datafusion` crate version and the second is the DataFusion/DataFusion CLI version.
19+
20+
You can update/preview the results by running:
21+
22+
```bash
23+
./make-json.sh <machine-name> # Example: ./make-json.sh c6a.xlarge
24+
```
25+
26+
## Parquet To Vortex Conversion
27+
28+
Each input file is converted independently through `vortex-datafusion-cli`:
29+
30+
```sql
31+
CREATE EXTERNAL TABLE hits_parquet
32+
STORED AS PARQUET
33+
LOCATION 'partitioned/hits_0.parquet'
34+
OPTIONS ('binary_as_string' 'true');
35+
36+
COPY (
37+
SELECT * EXCEPT ("EventDate"),
38+
CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate"
39+
FROM hits_parquet
40+
) TO 'vortex/hits_0.vortex' STORED AS VORTEX;
41+
```
42+
43+
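`binary_as_string=true` makes the Parquet reader read the binary columns as strings, working around the incorrect Parquet logical annotation before the Vortex files are written. The produced Vortex files therefore store those fields as strings, so benchmark reads need only the Vortex table registration, with no Parquet reader options.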
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#!/bin/bash
22
# Thin shim — actual flow is in lib/benchmark-common.sh.
3-
# query_bench (the vortex driver) handles its own dataset download/conversion.
4-
export BENCH_DOWNLOAD_SCRIPT=""
3+
export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned"
54
export BENCH_DURABLE=yes
65
export BENCH_RESTARTABLE=no
76
exec ../lib/benchmark-common.sh
Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#!/bin/bash
22
set -e
33

4-
# Stateless system — confirm datafusion-cli (the playground's query
5-
# driver) is on PATH.
6-
command -v datafusion-cli >/dev/null
4+
DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli
5+
"$DF" -q -c "SELECT 1" >/dev/null
Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,3 @@
1-
CREATE EXTERNAL TABLE hits_raw
2-
STORED AS PARQUET
3-
LOCATION 'partitioned'
4-
OPTIONS ('binary_as_string' 'true');
5-
6-
CREATE VIEW hits AS
7-
SELECT * EXCEPT ("EventDate"),
8-
CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate"
9-
FROM hits_raw;
1+
CREATE EXTERNAL TABLE hits
2+
STORED AS VORTEX
3+
LOCATION 'vortex';
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/bin/bash
22
set -e
33

4-
# Sum the byte counts of all generated .vortex files.
5-
find . -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}'
4+
find vortex -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}'
Lines changed: 28 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,36 @@
11
#!/bin/bash
22
set -e
33

4-
VORTEX_VERSION=0.44.0
4+
if [ ! -x vortex-datafusion-cli/target/release/vortex-datafusion-cli ]; then
5+
# <vortex-version>-<datafusion-version>
6+
CLI_TAG=0.70.0-53.1.0
57

6-
if ! command -v cargo >/dev/null 2>&1; then
7-
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
8-
bash rust-init.sh -y
9-
fi
10-
export HOME=${HOME:=~}
11-
# shellcheck disable=SC1091
12-
source ~/.cargo/env
8+
if ! command -v cargo >/dev/null 2>&1; then
9+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
10+
bash rust-init.sh -y
11+
fi
12+
export HOME=${HOME:=~}
13+
# shellcheck disable=SC1091
14+
source "$HOME/.cargo/env"
1315

14-
sudo apt-get update -y
15-
# vortex-duckdb's build.rs runs bindgen, which needs libclang plus the
16-
# clang freestanding headers (stdbool.h etc.); without libclang-dev the
17-
# build fails with `'stdbool.h' file not found`.
18-
sudo apt-get install -y gcc jq build-essential git clang libclang-dev
16+
if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then
17+
if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then
18+
sudo fallocate -l 8G /swapfile
19+
sudo chmod 600 /swapfile
20+
sudo mkswap /swapfile
21+
sudo swapon /swapfile
22+
fi
23+
fi
1924

20-
if [ ! -d vortex ]; then
21-
git clone https://github.com/spiraldb/vortex.git
22-
fi
23-
(
24-
cd vortex
25-
git fetch --tags
26-
git checkout "$VORTEX_VERSION"
27-
# See datafusion-vortex/install — submodule update isn't idempotent
28-
# without sync + --force when a previous run left a partial clone.
29-
git submodule sync --recursive
30-
git submodule update --init --recursive --force
31-
cargo build --release --bin query_bench --package bench-vortex
32-
)
25+
sudo apt-get update -y
26+
sudo apt-get install -y build-essential clang cmake git libclang-dev pkg-config
3327

34-
# Build datafusion-cli so ./query can return actual rows instead of
35-
# the bench driver's JSON timing blob. See datafusion-vortex/install.
36-
if ! command -v datafusion-cli >/dev/null 2>&1; then
37-
cargo install --locked --version 49.0.2 datafusion-cli
38-
# Cargo installs into $HOME/.cargo/bin; the playground agent
39-
# runs scripts with a stripped PATH, so symlink into /usr/local/bin.
40-
sudo ln -sf "$HOME/.cargo/bin/datafusion-cli" /usr/local/bin/datafusion-cli
28+
if [ ! -d vortex-datafusion-cli ]; then
29+
git clone https://github.com/vortex-data/vortex-datafusion-cli.git
30+
fi
31+
cd vortex-datafusion-cli
32+
git fetch --tags
33+
git checkout "$CLI_TAG"
34+
CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \
35+
cargo build --release --bin vortex-datafusion-cli
4136
fi

datafusion-vortex-partitioned/load

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,23 @@
11
#!/bin/bash
22
set -e
33

4-
# ./query uses datafusion-cli against the partitioned parquet files
5-
# under partitioned/. See datafusion-vortex/load for the rationale.
4+
DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli
5+
66
mkdir -p partitioned
7-
../lib/download-hits-parquet-partitioned partitioned
7+
mv hits_*.parquet partitioned/ 2>/dev/null || true
8+
9+
rm -rf vortex
10+
mkdir -p vortex
11+
12+
seq 0 99 | xargs -P"$(nproc)" -I{} "$DF" -q \
13+
-c "SET datafusion.execution.target_partitions = 1;" \
14+
-c "CREATE EXTERNAL TABLE hits_parquet STORED AS PARQUET LOCATION 'partitioned/hits_{}.parquet' OPTIONS ('binary_as_string' 'true');" \
15+
-c "COPY (SELECT * EXCEPT (\"EventDate\"), CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" FROM hits_parquet) TO 'vortex/hits_{}.vortex' STORED AS VORTEX;"
16+
17+
files=$(find vortex -maxdepth 1 -name 'hits_*.vortex' | wc -l)
18+
if [ "$files" -ne 100 ]; then
19+
echo "Expected 100 Vortex files, found $files" >&2
20+
exit 1
21+
fi
822

923
sync
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
2+
3+
# This script converts the raw `result.csv` data from `benchmark.sh` into the
4+
# final json format used by the benchmark dashboard.
5+
#
6+
# usage : ./make-json.sh <machine>
7+
#
8+
# example (saves results/<YYYYMMDD>/c6a.4xlarge.json):
9+
# ./make-json.sh c6a.4xlarge
10+
11+
MACHINE=$1
12+
DATE=$(date -u +%Y-%m-%d)
13+
YYYYMMDD=${DATE//-/}
14+
mkdir -p "results/${YYYYMMDD}"
15+
OUTPUT_FILE="results/${YYYYMMDD}/${MACHINE}.json"
16+
SYSTEM_NAME="DataFusion (Vortex, partitioned)"
17+
LOAD_TIME=${LOAD_TIME:-null}
18+
DATA_SIZE=${DATA_SIZE:-$(./data-size 2>/dev/null || echo null)}
19+
DATA_SIZE=${DATA_SIZE:-null}
20+
21+
# Read result.csv and build the result array using awk: the column-3 values are grouped by the column-1 key, one bracketed row per key
22+
RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i<length(arr)) printf ",\n"}}' result.csv)
23+
24+
# form the final JSON structure from the template
25+
cat <<EOF > $OUTPUT_FILE
26+
{
27+
"system": "$SYSTEM_NAME",
28+
"date": "$DATE",
29+
"machine": "$MACHINE",
30+
"cluster_size": 1,
31+
"proprietary": "no",
32+
"tuned": "no",
33+
"hardware": "cpu",
34+
"tags": ["Rust","column-oriented","embedded","stateless"],
35+
"load_time": $LOAD_TIME,
36+
"data_size": $DATA_SIZE,
37+
"result": [
38+
$RESULT_ARRAY
39+
]
40+
}
41+
EOF

datafusion-vortex-partitioned/queries.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPh
1616
SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
1717
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
1818
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
19-
SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
19+
SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
2020
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
2121
SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
2222
SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
@@ -40,4 +40,4 @@ SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventD
4040
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
4141
SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
4242
SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43-
SELECT DATE_TRUNC('minute', "EventTime") AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
43+
SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,21 @@
11
#!/bin/bash
2-
# Reads a SQL query from stdin, runs it via datafusion-cli against the
3-
# partitioned parquet files. See ../datafusion-vortex/query for the
4-
# rationale; the vortex bench binary is benchmark-only.
5-
# Stdout: query result.
6-
# Stderr: query runtime in fractional seconds on the last line.
2+
# Reads a SQL query from stdin and runs it via vortex-datafusion-cli.
3+
# Stdout: query result. Stderr: query runtime in fractional seconds.
74
set -e
85

6+
DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli
7+
98
query=$(cat)
10-
tmp=$(mktemp /tmp/datafusion.XXXXXX.sql)
9+
tmp=$(mktemp /tmp/datafusion-vortex.XXXXXX.sql)
1110
trap 'rm -f "$tmp"' EXIT
1211
printf '%s\n' "$query" > "$tmp"
1312

14-
out=$(datafusion-cli -f create.sql "$tmp" 2>&1) && status=0 || status=$?
13+
out=$("$DF" -f create.sql -f "$tmp" 2>&1) && status=0 || status=$?
1514

1615
if [ "$status" -ne 0 ]; then
1716
printf '%s\n' "$out" >&2
1817
exit "$status"
1918
fi
2019

2120
printf '%s\n' "$out" | grep -v 'Elapsed' || true
22-
2321
printf '%s\n' "$out" | awk '/Elapsed/ { e = $2 } END { print e }' >&2

0 commit comments

Comments
 (0)