Skip to content

Commit 7ff0d87

Browse files
authored
run each benchmark type in random access separately (#8470)
If we run vortex solely for feature-vectors in cached mode, the runtime on a CI-like box is 400µs. If we run it alongside lance, the runtime is 2500µs. Run each format, cache mode, dataset, and kind separately in CI, and aggregate the results to a file as before. Signed-off-by: Mikhail Kot <mikhail@spiraldb.com>
1 parent 6a3fb19 commit 7ff0d87

3 files changed

Lines changed: 124 additions & 0 deletions

File tree

.github/workflows/bench-pr.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,18 @@ jobs:
7272
profiling_frequency: 199
7373
extra_args: "--off-cpu-threshold=0.03" # Personally tuned by @brancz
7474

75+
- name: Run ${{ matrix.benchmark.name }} benchmark (per-combination)
76+
if: matrix.benchmark.id == 'random-access-bench'
77+
shell: bash
78+
env:
79+
RUST_BACKTRACE: full
80+
VORTEX_EXPERIMENTAL_PATCHED_ARRAY: "1"
81+
FLAT_LAYOUT_INLINE_ARRAY_NODE: "1"
82+
run: |
83+
python3 scripts/random-access-split.py
84+
7585
- name: Run ${{ matrix.benchmark.name }} benchmark
86+
if: matrix.benchmark.id != 'random-access-bench'
7687
shell: bash
7788
env:
7889
RUST_BACKTRACE: full

.github/workflows/bench.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,18 @@ jobs:
8585
profiling_frequency: 199
8686
extra_args: "--off-cpu-threshold=0.03" # Personally tuned by @brancz
8787

88+
- name: Run ${{ matrix.benchmark.name }} benchmark (per-combination)
89+
if: matrix.benchmark.id == 'random-access-bench'
90+
shell: bash
91+
env:
92+
RUST_BACKTRACE: full
93+
VORTEX_EXPERIMENTAL_PATCHED_ARRAY: "1"
94+
FLAT_LAYOUT_INLINE_ARRAY_NODE: "1"
95+
run: |
96+
python3 scripts/random-access-split.py --v3
97+
8898
- name: Run ${{ matrix.benchmark.name }} benchmark
99+
if: matrix.benchmark.id != 'random-access-bench'
89100
shell: bash
90101
env:
91102
RUST_BACKTRACE: full

scripts/random-access-split.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
"""
5+
Run random-access-bench once per (dataset, format, pattern, open-mode)
6+
then merge the per-combination outputs
7+
"""
8+
9+
import argparse
10+
import glob
11+
import json
12+
import subprocess
13+
from collections.abc import Callable
14+
from pathlib import Path
15+
16+
# Directory containing this script; used to locate bench-taskset.sh.
SCRIPT_DIR = Path(__file__).resolve().parent
17+
# Benchmark binary path handed to bench-taskset.sh as its first argument.
BINARY = "target/release_debug/random-access-bench"
18+
# Directory that receives one output file per benchmark invocation.
PARTS_DIR = Path("parts")
19+
20+
# Values passed, one per invocation, to the binary's --datasets flag.
DATASETS = ["taxi", "feature-vectors", "nested-lists", "nested-structs"]
21+
# Values passed, one per invocation, to the binary's --formats flag.
FORMATS = ["parquet", "lance", "vortex"]
22+
# Values passed, one per invocation, to the binary's --patterns flag.
PATTERNS = ["correlated", "uniform"]
23+
# Values passed, one per invocation, to the binary's --open-mode flag.
OPEN_MODES = ["cached", "reopen"]
24+
25+
26+
def run_combinations(emit_v3: bool) -> None:
    """
    Invoke the benchmark binary once per (dataset, format, pattern, open-mode).

    Each invocation goes through ``bench-taskset.sh`` and writes its gh-json
    output to ``PARTS_DIR/<index>.gh.json``, where ``<index>`` counts the
    invocations from zero. When ``emit_v3`` is true, the invocation is also
    asked to write ``PARTS_DIR/<index>.v3.jsonl`` via ``--gh-json-v3``.

    Raises:
        subprocess.CalledProcessError: if any invocation exits non-zero.
    """
    PARTS_DIR.mkdir(parents=True, exist_ok=True)
    launcher = str(SCRIPT_DIR / "bench-taskset.sh")

    # Flattened cartesian product, in the same order as four nested loops.
    combinations = (
        (dataset, fmt, pattern, open_mode)
        for dataset in DATASETS
        for fmt in FORMATS
        for pattern in PATTERNS
        for open_mode in OPEN_MODES
    )

    for index, (dataset, fmt, pattern, open_mode) in enumerate(combinations):
        command = ["bash", launcher, BINARY]
        command += ["--datasets", dataset]
        command += ["--formats", fmt]
        command += ["--patterns", pattern]
        command += ["--open-mode", open_mode]
        command += ["-d", "gh-json"]
        command += ["-o", str(PARTS_DIR / f"{index}.gh.json")]
        if emit_v3:
            command.extend(["--gh-json-v3", str(PARTS_DIR / f"{index}.v3.jsonl")])

        # Echo the command before running it so CI logs show what ran.
        print("+", " ".join(command), flush=True)
        subprocess.run(command, check=True)
55+
56+
57+
"""
58+
This function exists only because of taxi-legacy.
59+
60+
Every taxi invocation re-emits the pattern-less legacy taxi rows, so we need
61+
the merge to drop the duplicates. Otherwise we could just merge JSONL lines.
62+
"""
63+
64+
65+
def _part_sort_key(path: str) -> tuple[int, int, str]:
    """Sort key placing numbered part files in numeric order ("2.x" before "10.x")."""
    prefix = Path(path).name.split(".", 1)[0]
    if prefix.isdecimal():
        return (0, int(prefix), path)
    # Files without a numeric prefix come after the numbered ones, ordered by path.
    return (1, 0, path)


def merge(pattern: str, key: Callable[[dict], object], out_path: str) -> None:
    """
    Concatenate JSONL part files into ``out_path``, keeping one record per key.

    Part files may repeat records (every taxi invocation re-emits the legacy
    taxi rows), so only the first record seen for each ``key(record)`` is
    written. Part files are visited in numeric order of their file-name prefix,
    so the merged output follows the order in which the parts were produced
    rather than lexicographic order (where ``10`` would precede ``2``).

    Args:
        pattern: glob pattern selecting the part files to merge.
        key: maps a decoded JSON record to a hashable identity used to detect
            duplicates.
        out_path: file that receives the merged lines, one record per line.
            It is written even when no part file matches (as an empty file).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    seen: set[object] = set()
    lines: list[str] = []
    for path in sorted(glob.glob(pattern), key=_part_sort_key):
        with open(path, encoding="utf-8") as handle:
            for raw in handle:
                line = raw.strip()
                if not line:
                    # Tolerate blank lines between records.
                    continue
                identity = key(json.loads(line))
                if identity in seen:
                    continue
                seen.add(identity)
                lines.append(line)
    Path(out_path).write_text("".join(line + "\n" for line in lines), encoding="utf-8")
80+
81+
82+
def main() -> None:
    """
    Parse the command line, run every benchmark combination, and merge outputs.

    The gh-json parts are always merged into ``results.json``, de-duplicated by
    record name. With ``--v3``, the v3 parts are additionally merged into
    ``results.v3.jsonl``, de-duplicated by (kind, dataset, format).
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--v3",
        action="store_true",
        help="merge --gh-json-v3 records into results.v3.jsonl",
    )
    want_v3 = cli.parse_args().v3

    run_combinations(want_v3)

    def by_name(record: dict) -> object:
        return record["name"]

    merge(f"{PARTS_DIR}/*.gh.json", by_name, "results.json")

    if not want_v3:
        return

    def by_series(record: dict) -> object:
        return (record["kind"], record["dataset"], record["format"])

    merge(f"{PARTS_DIR}/*.v3.jsonl", by_series, "results.v3.jsonl")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)