Skip to content

Commit 8bbc5e9

Browse files
Merge pull request #910 from ClickHouse/better-json-import
Unify JSON import
2 parents: f279113 + 7503065; commit: 8bbc5e9

8 files changed

Lines changed: 22 additions & 61 deletions

File tree

parseable/benchmark.sh

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
#!/bin/bash
22
# Thin shim — actual flow is in lib/benchmark-common.sh.
3-
# parseable ingests gzipped NDJSON; ./load fetches it directly.
4-
export BENCH_DOWNLOAD_SCRIPT=""
3+
export BENCH_DOWNLOAD_SCRIPT="download-hits-json"
54
export BENCH_DURABLE=yes
65
exec ../lib/benchmark-common.sh

parseable/load

Lines changed: 5 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -3,22 +3,9 @@ set -eu
33

44
NUM_CORES=$(nproc)
55

6-
# Prefer the pre-decompressed hits.json shipped on the playground's
7-
# readonly dataset disk — it's a 217 GB symlink target, doesn't burn
8-
# the VM's 200 GB sparse sysdisk on a redundant gunzip. Fall back to
9-
# wget + gunzip for standalone use.
10-
if [ -f /opt/clickbench/datasets_ro/hits.json ]; then
11-
ln -sf /opt/clickbench/datasets_ro/hits.json hits.json
12-
elif [ -f /opt/clickbench/datasets_ro/hits.json.gz ]; then
13-
ln -sf /opt/clickbench/datasets_ro/hits.json.gz hits.json.gz
14-
FILE_SIZE=$(stat -L -c %s hits.json.gz)
15-
pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json
16-
else
17-
wget --continue --progress=dot:giga \
18-
'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
19-
FILE_SIZE=$(stat -L -c %s hits.json.gz)
20-
pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json
21-
fi
6+
# hits.json is delivered by the shared lib/download-hits-json helper
7+
# (symlink to the RO dataset on the playground, wget + pigz
8+
# elsewhere).
229

2310
# Create the stream first — ingest below needs it to exist.
2411
curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/hits' \
@@ -45,8 +32,8 @@ pv hits.json | parallel --pipe -N$LINES_PER_CHUNK --block 10M \
4532
--data-binary @- >/dev/null
4633
'
4734

48-
# Drop the symlink to the RO dataset — no chunk files to clean up.
49-
rm -f hits.json hits.json.gz
35+
# Drop the symlink/file delivered by lib/download-hits-json.
36+
rm -f hits.json
5037

5138
# Allow sync to complete.
5239
sleep 180

quickwit/benchmark.sh

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,7 @@
11
#!/bin/bash
22
# Thin shim — actual flow is in lib/benchmark-common.sh.
3-
# Quickwit takes Elasticsearch-format JSON queries; the load script fetches
4-
# hits.json.gz directly so no shared download-hits-* script applies.
5-
export BENCH_DOWNLOAD_SCRIPT=""
3+
# Quickwit takes Elasticsearch-format JSON queries.
4+
export BENCH_DOWNLOAD_SCRIPT="download-hits-json"
65
export BENCH_DURABLE=yes
76
export BENCH_QUERIES_FILE="queries.json"
87
exec ../lib/benchmark-common.sh

quickwit/load

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -8,20 +8,18 @@ curl -sS -X POST http://localhost:7280/api/v1/indexes \
88
-H 'Content-Type: application/yaml' \
99
--data-binary @index_config.yaml | jq -r '.index_uid // .message'
1010

11-
# No download-hits-json shared script; fetch directly. ~14 GB compressed.
12-
wget --continue -q 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
13-
1411
# Use `quickwit tool local-ingest` instead of the Elasticsearch-compatible
1512
# bulk endpoint. v0.9's sharded ingest-v2 API caps single-node throughput
1613
# to a few MB/s; local-ingest builds splits directly and writes them to
1714
# the index storage. The running server picks up new splits on its next
1815
# metastore poll (default 30s).
1916
#
17+
# hits.json is delivered by the shared lib/download-hits-json helper.
2018
# Throttle the per-second "Num docs ... Thrghput ... Time" progress lines
2119
# to once per ~30s so the captured log stays compact.
22-
zcat hits.json.gz | sudo docker run --rm -i --network host \
20+
sudo docker run --rm -i --network host \
2321
-v "$PWD/qwdata":/quickwit/qwdata \
24-
"$QW_IMAGE" tool local-ingest --index hits -y 2>&1 \
22+
"$QW_IMAGE" tool local-ingest --index hits -y < hits.json 2>&1 \
2523
| awk '/Num docs/ { n = systime(); if (n - last >= 30) { print; fflush(); last = n } next }
2624
{ print; fflush() }'
2725

@@ -31,5 +29,5 @@ sleep 35
3129
curl -sS "http://localhost:7280/api/v1/indexes/hits/describe" \
3230
| jq '{num_published_docs, num_published_splits, size_published_splits}'
3331

34-
rm -f hits.json.gz
32+
rm -f hits.json
3533
sync

siglens/benchmark.sh

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
#!/bin/bash
22
# Thin shim — actual flow is in lib/benchmark-common.sh.
3-
# siglens ingests its own gzipped NDJSON; ./load fetches it directly.
4-
export BENCH_DOWNLOAD_SCRIPT=""
3+
export BENCH_DOWNLOAD_SCRIPT="download-hits-json"
54
export BENCH_DURABLE=yes
65
# queries are SPL/Splunk QL, not SQL.
76
export BENCH_QUERIES_FILE="queries.spl"

siglens/load

Lines changed: 3 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,9 @@
11
#!/bin/bash
22
set -eu
33

4-
# Prefer the pre-decompressed hits.json shipped on the playground's
5-
# readonly dataset disk — decompressing the 22 GB hits.json.gz into
6-
# a 217 GB hits.json blows past the VM's sparse rootfs budget and
7-
# kills the load with "No space left on device". Fall back to wget +
8-
# pigz for standalone use.
9-
if [ -f /opt/clickbench/datasets_ro/hits.json ]; then
10-
ln -sf /opt/clickbench/datasets_ro/hits.json hits.json
11-
else
12-
wget --continue --progress=dot:giga \
13-
'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
14-
pigz -d -f hits.json.gz
15-
fi
16-
4+
# hits.json is delivered by the shared lib/download-hits-json helper
5+
# (symlink to the RO dataset on the playground, wget + pigz
6+
# elsewhere).
177
python3 send_datawithactionline.py
188

199
rm -f hits.json

victorialogs/benchmark.sh

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
#!/bin/bash
22
# Thin shim — actual flow is in lib/benchmark-common.sh.
3-
# victorialogs ingests gzipped NDJSON; ./load fetches it directly.
4-
export BENCH_DOWNLOAD_SCRIPT=""
3+
export BENCH_DOWNLOAD_SCRIPT="download-hits-json"
54
export BENCH_DURABLE=yes
65
# queries are LogsQL, not SQL.
76
export BENCH_QUERIES_FILE="queries.logsql"

victorialogs/load

Lines changed: 5 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -11,20 +11,10 @@ for _ in {1..300}; do
1111
sleep 1
1212
done
1313

14-
# Use the pre-decompressed hits.json on the playground's readonly
15-
# dataset disk (no temp file, no 75 GB gunzip in the VM). Fall back
16-
# to wget + gunzip when running standalone.
17-
if [ -f /opt/clickbench/datasets_ro/hits.json ]; then
18-
HITS=/opt/clickbench/datasets_ro/hits.json
19-
else
20-
wget --continue --progress=dot:giga \
21-
'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
22-
gunzip -f hits.json.gz
23-
HITS=hits.json
24-
fi
14+
# Bulk insert via 8 parallel jsonline streams. hits.json is delivered
15+
# by the shared lib/download-hits-json helper (symlink to the RO
16+
# dataset on the playground, wget + pigz elsewhere).
17+
split -n r/8 -d --filter="curl -sS -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=EventTime&_stream_fields=AdvEngineID,CounterID'" hits.json
2518

26-
# Bulk insert via 8 parallel jsonline streams.
27-
split -n r/8 -d --filter="curl -sS -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=EventTime&_stream_fields=AdvEngineID,CounterID'" "$HITS"
28-
29-
[ "$HITS" = "hits.json" ] && rm -f hits.json
19+
rm -f hits.json
3020
sync

0 commit comments

Comments
 (0)