Skip to content

Commit ccbf4c5

Browse files
committed
Initial setup
1 parent 1192520 commit ccbf4c5

22 files changed

Lines changed: 517 additions & 0 deletions
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# DataFusion + Vortex
2+
3+
Partitioned Vortex dataset, converted one-for-one from the 100 ClickBench Parquet files and queried with [`vortex-datafusion-cli`].
4+
5+
[`vortex-datafusion-cli`]: https://github.com/vortex-data/vortex-datafusion-cli
6+
7+
## Cookbook: Generate benchmark results
8+
9+
Follow the same EC2 setup used by [datafusion-partitioned](../datafusion-partitioned/README.md), then run:
10+
11+
```bash
12+
cd ClickBench/datafusion-vortex-partitioned
13+
bash benchmark.sh
14+
```
15+
16+
The benchmark script builds `vortex-datafusion-cli`, downloads the partitioned Parquet files, converts each `partitioned/hits_N.parquet` file into exactly one `vortex/hits_N.vortex` file, and runs the query set.
17+
18+
## Parquet to Vortex conversion
19+
20+
Each input file is converted independently through `vortex-datafusion-cli`:
21+
22+
```sql
23+
CREATE EXTERNAL TABLE hits_parquet
24+
STORED AS PARQUET
25+
LOCATION 'partitioned/hits_0.parquet'
26+
OPTIONS ('binary_as_string' 'true');
27+
28+
COPY (
29+
SELECT * EXCEPT ("EventDate"),
30+
CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate"
31+
FROM hits_parquet
32+
) TO 'vortex/hits_0.vortex' STORED AS VORTEX;
33+
```
34+
35+
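The ClickBench Parquet files store their string columns as plain binary, without a string logical-type annotation; `binary_as_string=true` makes the reader treat those columns as strings before Vortex is written. `EventDate` is likewise not stored as a date in the Parquet files, so the conversion query casts it to `DATE` (via `INTEGER`). The produced Vortex files store those fields as strings, so benchmark reads use only the Vortex table registration and need no read-time options.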
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#!/bin/bash
#
# End-to-end benchmark driver for DataFusion + Vortex (partitioned).
#
# Steps, in order:
#   1. On hosts with less than 12 GiB of RAM, enable an 8G swap file if no
#      swap is active.
#   2. Install Rust (if cargo is missing) and the build dependencies.
#   3. Build vortex-datafusion-cli at tag $CLI_TAG and put it on PATH.
#   4. Download the 100 partitioned ClickBench Parquet files.
#   5. Convert each Parquet file to one Vortex file via ./convert.sh, timing
#      the whole conversion into load-time.txt.
#   6. Run the query set via ./run.sh and report the data size.
#
# Must be run from the directory containing convert.sh and run.sh.

set -Eeuo pipefail

export HOME=${HOME:=~}
# <vortex-version>-<df-version>
CLI_TAG=0.70.0-53.1.0
WITH_SWAP=false

# Release the swap file created by this script. Registered as an EXIT trap so
# the swap is removed on every exit path, including an abort under `set -e`.
# Failures are ignored here so cleanup never replaces the script's own status.
cleanup_swap() {
    if [ "$WITH_SWAP" = true ]; then
        echo "Disable swap"
        sudo swapoff /swapfile || true
        sudo rm -f /swapfile || true
        WITH_SWAP=false
    fi
}
trap cleanup_swap EXIT

if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then
    echo "LOW MEMORY MODE"
    if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then
        echo "Enabling 8G swap"
        # Flag the swap file as ours before creating it, so a failure in any
        # of the following steps still gets it cleaned up by the trap.
        WITH_SWAP=true
        sudo fallocate -l 8G /swapfile
        sudo chmod 600 /swapfile
        sudo mkswap /swapfile
        sudo swapon /swapfile
    fi
fi

echo "Install Rust"
if ! command -v cargo >/dev/null 2>&1; then
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
    bash rust-init.sh -y
fi
# The env file is created by rustup. When cargo was already present from
# another source the file may not exist, and sourcing it unconditionally
# would abort the script under `set -e`.
if [ -f "$HOME/.cargo/env" ]; then
    # shellcheck disable=SC1091
    source "$HOME/.cargo/env"
fi

echo "Install dependencies"
sudo apt-get update -y
sudo apt-get install -y build-essential cmake pkg-config time

echo "Install vortex-datafusion-cli"
rm -rf vortex-datafusion-cli
git clone https://github.com/vortex-data/vortex-datafusion-cli.git
cd vortex-datafusion-cli
git checkout "$CLI_TAG"
CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --bin vortex-datafusion-cli
# convert.sh and run.sh invoke the CLI by name, so expose the fresh build.
export PATH="$PWD/target/release:$PATH"
cd ..

echo "Download benchmark target data, partitioned"
mkdir -p partitioned
seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'

echo "Convert Parquet to Vortex"
rm -rf vortex
mkdir -p vortex
# Time the complete conversion (elapsed seconds go to load-time.txt). The
# conversion output is captured and only shown when the conversion fails.
if ! /usr/bin/time -f '%e' -o load-time.txt bash -c '
    set -Eeuo pipefail
    seq 0 99 | xargs -P"$(nproc)" -I{} ./convert.sh "partitioned/hits_{}.parquet" "vortex/hits_{}.vortex"
' > convert.log 2>&1; then
    cat convert.log
    exit 1
fi
# Guard against a partially converted dataset before benchmarking it.
VORTEX_FILES=$(find vortex -maxdepth 1 -name 'hits_*.vortex' | wc -l)
if [ "$VORTEX_FILES" -ne 100 ]; then
    echo "Expected 100 Vortex files, found $VORTEX_FILES" >&2
    exit 1
fi
echo "Load time: $(cat load-time.txt)"

echo "Run benchmarks for partitioned"
./run.sh

echo "Data size: $(du -bcs vortex/*.vortex | grep total)"

# Swap teardown happens in the cleanup_swap EXIT trap.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
#
# Convert a single Parquet file into a single Vortex file using
# vortex-datafusion-cli (which must be on PATH).
#
# Usage: convert.sh <input.parquet> <output.vortex>
#
# Exit status:
#   2  wrong number of arguments
#   1  the output file does not exist after the conversion
#   other non-zero values propagate from the failing command (set -e)

set -Eeuo pipefail

if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <input.parquet> <output.vortex>" >&2
    exit 2
fi

INPUT=$1
OUTPUT=$2

# Both paths are embedded in single-quoted SQL string literals below. Double
# every single quote so a path containing one cannot end the literal early.
SQ="'"
INPUT_SQL=${INPUT//$SQ/$SQ$SQ}
OUTPUT_SQL=${OUTPUT//$SQ/$SQ$SQ}

mkdir -p -- "$(dirname -- "$OUTPUT")"
# Remove any stale output first, so the existence check at the end can only
# be satisfied by a file written during this run.
rm -f -- "$OUTPUT"

# The table is read with binary_as_string enabled, and "EventDate" is
# replaced by a DATE column derived from its integer value.
vortex-datafusion-cli -q \
    -c "SET datafusion.execution.target_partitions = 1;" \
    -c "CREATE EXTERNAL TABLE hits_parquet STORED AS PARQUET LOCATION '$INPUT_SQL' OPTIONS ('binary_as_string' 'true');" \
    -c "COPY (SELECT * EXCEPT (\"EventDate\"), CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" FROM hits_parquet) TO '$OUTPUT_SQL' STORED AS VORTEX;"

if [ ! -f "$OUTPUT" ]; then
    echo "Conversion of $INPUT did not produce $OUTPUT" >&2
    exit 1
fi
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
-- Registers the `vortex` location as the external table `hits`, the table every benchmark query selects from.
CREATE EXTERNAL TABLE hits
2+
STORED AS VORTEX
3+
LOCATION 'vortex';
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
SELECT COUNT(*) FROM hits;
2+
SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
3+
SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
4+
SELECT AVG("UserID") FROM hits;
5+
SELECT COUNT(DISTINCT "UserID") FROM hits;
6+
SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
7+
SELECT MIN("EventDate"), MAX("EventDate") FROM hits;
8+
SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
9+
SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
10+
SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
11+
SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
12+
SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
13+
SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
14+
SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
15+
SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
16+
SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
17+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
18+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
19+
SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
20+
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
21+
SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
22+
SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
23+
SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
24+
SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
25+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
26+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
27+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
28+
SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
29+
SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
30+
SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), 
SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
31+
SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
32+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
33+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
34+
SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
35+
SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
36+
SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
37+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
38+
SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
39+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
40+
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
41+
SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
42+
SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43+
SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"system": "DataFusion (Vortex, partitioned)",
3+
"date": "2026-05-05",
4+
"machine": "c6a.2xlarge",
5+
"cluster_size": 1,
6+
"proprietary": "no",
7+
"hardware": "cpu",
8+
"tuned": "no",
9+
"tags": ["Rust","column-oriented","embedded","stateless"],
10+
"load_time": null,
11+
"data_size": null,
12+
"result": []
13+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"system": "DataFusion (Vortex, partitioned)",
3+
"date": "2026-05-05",
4+
"machine": "c6a.4xlarge",
5+
"cluster_size": 1,
6+
"proprietary": "no",
7+
"hardware": "cpu",
8+
"tuned": "no",
9+
"tags": ["Rust","column-oriented","embedded","stateless"],
10+
"load_time": null,
11+
"data_size": null,
12+
"result": []
13+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"system": "DataFusion (Vortex, partitioned)",
3+
"date": "2026-05-05",
4+
"machine": "c6a.xlarge",
5+
"cluster_size": 1,
6+
"proprietary": "no",
7+
"hardware": "cpu",
8+
"tuned": "no",
9+
"tags": ["Rust","column-oriented","embedded","stateless"],
10+
"load_time": null,
11+
"data_size": null,
12+
"result": []
13+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"system": "DataFusion (Vortex, partitioned)",
3+
"date": "2026-05-05",
4+
"machine": "c8g.4xlarge",
5+
"cluster_size": 1,
6+
"proprietary": "no",
7+
"hardware": "cpu",
8+
"tuned": "no",
9+
"tags": ["Rust","column-oriented","embedded","stateless"],
10+
"load_time": null,
11+
"data_size": null,
12+
"result": []
13+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
#
# Run every query in queries.sql (one query per line) TRIES times with
# vortex-datafusion-cli, which must be on PATH.
#
# Output:
#   stdout      one line per query: "[t1, t2, t3]," where each entry is the
#               elapsed time reported by the CLI, or "null" if none was found
#   result.csv  one row per run: <query number>,<try>,<elapsed time>
#               (the time field is empty when the run reported none)
#
# The OS page cache is dropped once per query, before its first try.

TRIES=3
QUERY_NUM=1
: > result.csv

# Private scratch directory for the current query instead of a fixed,
# predictable path in /tmp; removed again on exit.
QUERY_DIR=$(mktemp -d) || exit 1
QUERY_FILE="$QUERY_DIR/query.sql"
trap 'rm -rf -- "$QUERY_DIR"' EXIT

# `|| [[ -n "$query" ]]` keeps the last query even if the file has no
# trailing newline.
while IFS= read -r query || [[ -n "$query" ]]; do
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null

    # printf rather than echo: the queries contain backslashes.
    printf '%s\n' "$query" > "$QUERY_FILE"

    echo -n "["
    for i in $(seq 1 "$TRIES"); do
        # Take the second field of the last "Elapsed" line of the CLI output.
        # stdin is detached so the CLI cannot consume the remaining queries.
        RES=$(vortex-datafusion-cli -f create.sql -f "$QUERY_FILE" </dev/null 2>&1 | grep "Elapsed" | tail -1 | awk '{ print $2 }')
        if [[ -n "$RES" ]]; then
            echo -n "$RES"
        else
            echo -n "null"
        fi
        [[ "$i" != "$TRIES" ]] && echo -n ", "
        echo "${QUERY_NUM},${i},${RES}" >> result.csv
    done
    echo "],"

    QUERY_NUM=$((QUERY_NUM + 1))
done < queries.sql

0 commit comments

Comments
 (0)