Skip to content

Commit 7b424a9

Browse files
committed
feat: enhance JVM configuration and memory management in benchmark scripts
1 parent a1054a1 commit 7b424a9

8 files changed

Lines changed: 113 additions & 34 deletions

File tree

bindings/python/build_and_install_locally.sh

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,29 @@ docker run --rm \
2222
-v "${REPO_ROOT}":/src \
2323
-w /src \
2424
maven:3.9-amazoncorretto-25 \
25-
sh -c "git config --global --add safe.directory /src && ./mvnw -DskipTests -pl package -am package"
25+
sh -c "if ! command -v git >/dev/null 2>&1 || ! command -v tar >/dev/null 2>&1; then \
26+
if command -v yum >/dev/null 2>&1; then \
27+
yum -y install git tar; \
28+
elif command -v apt-get >/dev/null 2>&1; then \
29+
apt-get update && apt-get install -y git tar; \
30+
else \
31+
echo '❌ git/tar not found and no supported package manager (yum/apt-get) available' >&2; \
32+
exit 1; \
33+
fi; \
34+
fi; \
35+
git config --global --add safe.directory /src && ./mvnw -DskipTests -pl package -am package"
2636

2737
# 2) Copy freshly built JARs into local-jars for the Python build
2838
log "Staging JARs into bindings/python/local-jars/lib..."
2939
mkdir -p "${LOCAL_JARS_DIR}"
30-
HEADLESS_LIB_DIR="${REPO_ROOT}/package/target/arcadedb-26.1.1-SNAPSHOT-headless.dir/arcadedb-26.1.1-SNAPSHOT/lib"
31-
if [[ ! -d "${HEADLESS_LIB_DIR}" ]]; then
32-
echo "❌ Headless package lib directory not found at ${HEADLESS_LIB_DIR}" >&2
40+
HEADLESS_PARENT_DIR=$(ls -d "${REPO_ROOT}/package/target/arcadedb-"*-headless.dir 2> /dev/null | sort | tail -n 1)
41+
if [[ -z "${HEADLESS_PARENT_DIR}" ]]; then
42+
echo "❌ Headless package directory not found under ${REPO_ROOT}/package/target" >&2
43+
exit 1
44+
fi
45+
HEADLESS_LIB_DIR=$(ls -d "${HEADLESS_PARENT_DIR}"/arcadedb-*/lib 2> /dev/null | sort | tail -n 1)
46+
if [[ -z "${HEADLESS_LIB_DIR}" || ! -d "${HEADLESS_LIB_DIR}" ]]; then
47+
echo "❌ Headless package lib directory not found under ${HEADLESS_PARENT_DIR}" >&2
3348
exit 1
3449
fi
3550
cp "${HEADLESS_LIB_DIR}"/*.jar "${LOCAL_JARS_DIR}/"
@@ -43,7 +58,7 @@ cd "${PY_BINDINGS_DIR}"
4358

4459
# 4) Install the wheel with uv (force reinstall)
4560
log "Installing wheel via uv pip..."
46-
WHEEL_PATH=$(ls -1 dist/arcadedb_embedded-*-linux_x86_64.whl | sort | tail -n 1)
61+
WHEEL_PATH=$(ls -1 dist/arcadedb_embedded-*.whl | sort | tail -n 1)
4762
if [[ -z "${WHEEL_PATH}" ]]; then
4863
echo "❌ No wheel found in dist/. Did the build succeed?" >&2
4964
exit 1

bindings/python/examples/benchmark-vector/benchmark_arcadedb_msmarco.py

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,16 @@ def _loop():
8989
return stop_event
9090

9191

92-
def heap_tag_from_env() -> str:
93-
"""Return heap tag derived from ARCADEDB_JVM_ARGS (-Xmx), or 'default'."""
92+
def heap_tag_from_args(heap_size: str | None, jvm_args: str | None) -> str:
93+
"""Return heap tag derived from explicit args, or 'default'."""
9494

95-
jvm_args = os.environ.get("ARCADEDB_JVM_ARGS", "")
96-
m = re.search(r"-Xmx(\S+)", jvm_args)
97-
if not m:
98-
return "default"
99-
return m.group(1)
95+
if heap_size:
96+
return heap_size
97+
if jvm_args:
98+
m = re.search(r"-Xmx(\S+)", jvm_args)
99+
if m:
100+
return m.group(1)
101+
return "default"
100102

101103

102104
# -------------------------
@@ -466,16 +468,38 @@ def main():
466468
default=100_000,
467469
help="Number of vectors to ingest per transaction batch",
468470
)
471+
ap.add_argument(
472+
"--heap-size",
473+
default=None,
474+
help=(
475+
"Heap size for the embedded JVM (e.g., 8g). Prefer this over "
476+
"ARCADEDB_JVM_ARGS."
477+
),
478+
)
479+
ap.add_argument(
480+
"--jvm-args",
481+
default=None,
482+
help=(
483+
"Extra JVM args to pass to start_jvm (e.g., thread flags). "
484+
"Prefer this over ARCADEDB_JVM_ARGS."
485+
),
486+
)
469487
args = ap.parse_args()
470488

471489
stop_cpu = start_cpu_logger(2)
472490

473491
np.random.seed(args.seed)
474492
eval_k = 50
475493

476-
# Import after potential JVM arg override
494+
# Import and configure JVM before any DB creation
477495
import arcadedb_embedded as arcadedb
478496

497+
jvm_kwargs = {}
498+
if args.heap_size is not None:
499+
jvm_kwargs["heap_size"] = args.heap_size
500+
if args.jvm_args is not None:
501+
jvm_kwargs["jvm_args"] = args.jvm_args
502+
479503
sources, gt_path, dim, label = resolve_dataset(Path(args.dataset_dir))
480504
total_rows = sum(s["count"] for s in sources)
481505
count = args.count if args.count is not None else total_rows
@@ -513,7 +537,7 @@ def record(name: str, result, dur, rss_start, rss_end):
513537
record("load_queries", {"queries": len(queries)}, dur, r0, r1)
514538

515539
# Prepare DB path
516-
heap_tag = heap_tag_from_env()
540+
heap_tag = heap_tag_from_args(args.heap_size, args.jvm_args)
517541
param_dir = "_".join(
518542
[
519543
f"dataset={Path(args.dataset_dir).name}",
@@ -537,7 +561,8 @@ def record(name: str, result, dur, rss_start, rss_end):
537561

538562
# Create DB
539563
(db, dur, r0, r1) = timed_section(
540-
"create_db", lambda: arcadedb.create_database(str(db_path))
564+
"create_db",
565+
lambda: arcadedb.create_database(str(db_path), jvm_kwargs=jvm_kwargs),
541566
)
542567
record("create_db", {"db_path": str(db_path)}, dur, r0, r1)
543568

@@ -670,7 +695,8 @@ def record(name: str, result, dur, rss_start, rss_end):
670695

671696
# Reopen
672697
(db, dur, r0, r1) = timed_section(
673-
"open_db", lambda: arcadedb.open_database(str(db_path))
698+
"open_db",
699+
lambda: arcadedb.open_database(str(db_path), jvm_kwargs=jvm_kwargs),
674700
)
675701
record("open_db", {}, dur, r0, r1)
676702

bindings/python/examples/benchmark-vector/run_arcadedb_search_study.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,9 @@ def run_single(
9090
output_root: Path,
9191
base_config: Dict,
9292
heap_tag: str | None,
93+
jvm_kwargs: Dict,
94+
arcadedb,
9395
) -> Path:
94-
import arcadedb_embedded as arcadedb
9596

9697
sources, gt_path, dim, label = resolve_dataset(dataset_dir)
9798
total_rows = sum(s["count"] for s in sources)
@@ -111,7 +112,8 @@ def run_single(
111112
record(phases, "load_queries", {"queries": len(queries)}, dur, r0, r1)
112113

113114
(db, dur, r0, r1) = timed_section(
114-
"open_db", lambda: arcadedb.open_database(str(db_path))
115+
"open_db",
116+
lambda: arcadedb.open_database(str(db_path), jvm_kwargs=jvm_kwargs),
115117
)
116118
record(phases, "open_db", {}, dur, r0, r1)
117119

@@ -288,6 +290,22 @@ def main() -> None:
288290
"--heap-tag",
289291
help="Heap tag to record in outputs (e.g., 8g). Defaults to heap from db path.",
290292
)
293+
ap.add_argument(
294+
"--heap-size",
295+
default=None,
296+
help=(
297+
"Heap size for the embedded JVM (e.g., 8g). Prefer this over "
298+
"ARCADEDB_JVM_ARGS."
299+
),
300+
)
301+
ap.add_argument(
302+
"--jvm-args",
303+
default=None,
304+
help=(
305+
"Extra JVM args to pass to start_jvm (e.g., thread flags). "
306+
"Prefer this over ARCADEDB_JVM_ARGS."
307+
),
308+
)
291309

292310
args = ap.parse_args()
293311

@@ -299,11 +317,19 @@ def main() -> None:
299317
if not db_path.exists():
300318
raise SystemExit(f"DB path not found: {db_path}")
301319

320+
import arcadedb_embedded as arcadedb
321+
322+
jvm_kwargs: Dict = {}
323+
if args.heap_size is not None:
324+
jvm_kwargs["heap_size"] = args.heap_size
325+
if args.jvm_args is not None:
326+
jvm_kwargs["jvm_args"] = args.jvm_args
327+
302328
base_config = load_existing_config(db_path)
303329
quant = (base_config.get("quantization") or "NONE").upper()
304330

305331
fallback_heap = run_dir_value(db_path.name, "heap")
306-
heap_tag = args.heap_tag or fallback_heap
332+
heap_tag = args.heap_tag or args.heap_size or fallback_heap
307333

308334
created: List[Path] = []
309335
for oq in overqueries:
@@ -317,6 +343,8 @@ def main() -> None:
317343
output_root=output_root,
318344
base_config=base_config,
319345
heap_tag=heap_tag,
346+
jvm_kwargs=jvm_kwargs,
347+
arcadedb=arcadedb,
320348
)
321349
)
322350

bindings/python/examples/benchmark-vector/run_arcadedb_search_study.sh

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,17 @@ for HEAP in "${HEAPS[@]}"; do
5858
fi
5959

6060
echo ">>> RUN: heap=${HEAP} threads=${THREADS} overquery=${OVERQUERY_CSV}" >&2
61-
ARCADEDB_JVM_ARGS="$JVM_ARGS" \
62-
python "$PY" \
61+
HEAP_FLAG=()
62+
if [[ "$HEAP" != "default" ]]; then
63+
HEAP_FLAG=("--heap-size" "$HEAP")
64+
fi
65+
66+
python "$PY" \
6367
--db-path "$DB_PATH" \
6468
--dataset-dir "$DATASET_DIR" \
6569
--overquery-factors "$OVERQUERY_CSV" \
6670
--output-root "$OUTPUT_ROOT" \
67-
--heap-tag "$HEAP"
71+
--heap-tag "$HEAP" \
72+
--jvm-args "$JVM_ARGS" \
73+
"${HEAP_FLAG[@]}"
6874
done

bindings/python/examples/benchmark-vector/run_arcadedb_sweep.sh

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ if [[ ! -f "$BENCH_PY" ]]; then
2323
exit 1
2424
fi
2525

26-
# Heap selection: use ARCADEDB_JVM_ARGS if set, otherwise sweep HEAP_SIZES
27-
PRESET_JVM_ARGS="${ARCADEDB_JVM_ARGS:-}"
26+
# Heap selection: use BENCH_JVM_ARGS if set, otherwise sweep HEAP_SIZES
27+
PRESET_JVM_ARGS="${BENCH_JVM_ARGS:-}"
2828
HEAP_SIZES_CSV="${HEAP_SIZES:-}"
2929
HEAP_SIZES=()
3030

@@ -43,21 +43,21 @@ HEAP_SIZES=()
4343
if [[ -z "$PRESET_JVM_ARGS" ]]; then
4444
# Default heap sweep per dataset (override by setting HEAP_SIZES="8g,12g,16g")
4545
case "$(basename "$DATASET_DIR")" in
46-
# *MSMARCO-100K*) HEAP_SIZES=("1g" "2g") ;; # 1g works fine. ran with 4 threads
47-
# *MSMARCO-1M*) HEAP_SIZES=("4g") ;; # below 4g heap is already really slow. ran with 4 threads
46+
*MSMARCO-100K*) HEAP_SIZES=("2g") ;; # 1g also works fine; ran with 4 threads
47+
# *MSMARCO-1M*) HEAP_SIZES=("4g") ;;
4848
# *MSMARCO-2M*) HEAP_SIZES=("8g") ;; # 4 threads
4949
# *MSMARCO-4M*) HEAP_SIZES=("16g") ;; # 4 threads
5050
# *MSMARCO-8M*) HEAP_SIZES=("32g") ;; # 4 threads
51-
# *MSMARCO-16M*) HEAP_SIZES=("32g") ;; # 4 threads
52-
*MSMARCO-32M*) HEAP_SIZES=("32g") ;; # 4 threads
51+
# *MSMARCO-16M*) HEAP_SIZES=("64g") ;; # 4 threads
52+
# *MSMARCO-32M*) HEAP_SIZES=("128g") ;; # 4 threads
5353
*) HEAP_SIZES=("default") ;;
5454
esac
5555

5656
if [[ -n "$HEAP_SIZES_CSV" ]]; then
5757
IFS="," read -ra HEAP_SIZES <<< "$HEAP_SIZES_CSV"
5858
fi
5959
else
60-
echo "Using pre-set ARCADEDB_JVM_ARGS='${PRESET_JVM_ARGS}'" >&2
60+
echo "Using pre-set BENCH_JVM_ARGS='${PRESET_JVM_ARGS}'" >&2
6161
HEAP_SIZES=("preset")
6262
fi
6363

@@ -106,11 +106,13 @@ for HEAP in "${HEAP_SIZES[@]}"; do
106106
fi
107107

108108
BASE_ENV="${THREAD_ENV}"
109+
BASE="${BASE_ENV} python \"${BENCH_PY}\" --dataset-dir \"${DATASET_DIR}\" --db-root \"${DB_ROOT}\""
109110
if [[ -n "$JVM_ARGS" ]]; then
110-
BASE_ENV="ARCADEDB_JVM_ARGS=\"${JVM_ARGS}\" ${BASE_ENV}"
111+
BASE="${BASE} --jvm-args \"${JVM_ARGS}\""
112+
fi
113+
if [[ "$HEAP" != "default" && "$HEAP" != "preset" ]]; then
114+
BASE="${BASE} --heap-size ${HEAP}"
111115
fi
112-
113-
BASE="${BASE_ENV} python \"${BENCH_PY}\" --dataset-dir \"${DATASET_DIR}\" --db-root \"${DB_ROOT}\""
114116

115117
for MC in "${MAX_CONNECTIONS[@]}"; do
116118
for BW in "${BEAM_WIDTHS[@]}"; do

bindings/python/examples/benchmark-vector/summarize_arcadedb_search_study.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ def parse_record(path: Path) -> Dict | None:
7676
"heap": heap_tag,
7777
"overquery_factor": overquery,
7878
"quantization": cfg.get("quantization"),
79+
"add_hierarchy": cfg.get("add_hierarchy"),
7980
"max_connections": cfg.get("max_connections"),
8081
"beam_width": cfg.get("beam_width"),
8182
"search_s": phases.get("search", {}).get("time_sec"),
@@ -132,6 +133,7 @@ def df_to_markdown(df: pd.DataFrame) -> str:
132133
"heap",
133134
"overquery_factor",
134135
"quantization",
136+
"add_hierarchy",
135137
"max_connections",
136138
"beam_width",
137139
"recall@50",

bindings/python/examples/run_with_memory_monitor.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# ./run_with_memory_monitor.sh <log_prefix> <python_command>
77
#
88
# Example:
9-
# ./run_with_memory_monitor.sh vector_large "ARCADEDB_JVM_ARGS='-Xmx8g -Xms8g' python 06_vector_search_recommendations.py --source-db my_test_databases/movielens_graph_large_db --db-path my_test_databases/movielens_graph_large_db_vectors"
9+
# ./run_with_memory_monitor.sh vector_large "python 06_vector_search_recommendations.py --source-db my_test_databases/movielens_graph_large_db --db-path my_test_databases/movielens_graph_large_db_vectors"
1010
#
1111

1212
if [ $# -lt 2 ]; then

bindings/python/src/arcadedb_embedded/vector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ def find_nearest_approximate(
241241
self,
242242
query_vector,
243243
k=10,
244-
overquery_factor=1,
244+
overquery_factor=4,
245245
allowed_rids=None,
246246
):
247247
"""

0 commit comments

Comments
 (0)