Merge pull request #910 from ClickHouse/better-json-import

alexey-milovidov · web-flow · commit 8bbc5e9b84c0 · 2026-05-16T19:59:47.000+02:00
Unify JSON import
diff --git a/parseable/benchmark.sh b/parseable/benchmark.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 # Thin shim — actual flow is in lib/benchmark-common.sh.
-# parseable ingests gzipped NDJSON; ./load fetches it directly.
-export BENCH_DOWNLOAD_SCRIPT=""
+export BENCH_DOWNLOAD_SCRIPT="download-hits-json"
 export BENCH_DURABLE=yes
 exec ../lib/benchmark-common.sh
diff --git a/parseable/load b/parseable/load
@@ -3,22 +3,9 @@ set -eu
 
 NUM_CORES=$(nproc)
 
-# Prefer the pre-decompressed hits.json shipped on the playground's
-# readonly dataset disk — it's a 217 GB symlink target, doesn't burn
-# the VM's 200 GB sparse sysdisk on a redundant gunzip. Fall back to
-# wget + gunzip for standalone use.
-if [ -f /opt/clickbench/datasets_ro/hits.json ]; then
-    ln -sf /opt/clickbench/datasets_ro/hits.json hits.json
-elif [ -f /opt/clickbench/datasets_ro/hits.json.gz ]; then
-    ln -sf /opt/clickbench/datasets_ro/hits.json.gz hits.json.gz
-    FILE_SIZE=$(stat -L -c %s hits.json.gz)
-    pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json
-else
-    wget --continue --progress=dot:giga \
-        'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
-    FILE_SIZE=$(stat -L -c %s hits.json.gz)
-    pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json
-fi
+# hits.json is delivered by the shared lib/download-hits-json helper
+# (symlink to the RO dataset on the playground, wget + pigz
+# elsewhere).
 
 # Create the stream first — ingest below needs it to exist.
 curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/hits' \
@@ -45,8 +32,8 @@ pv hits.json | parallel --pipe -N$LINES_PER_CHUNK --block 10M \
             --data-binary @- >/dev/null
     '
 
-# Drop the symlink to the RO dataset — no chunk files to clean up.
-rm -f hits.json hits.json.gz
+# Drop the symlink/file delivered by lib/download-hits-json.
+rm -f hits.json
 
 # Allow sync to complete.
 sleep 180
diff --git a/quickwit/benchmark.sh b/quickwit/benchmark.sh
@@ -1,8 +1,7 @@
 #!/bin/bash
 # Thin shim — actual flow is in lib/benchmark-common.sh.
-# Quickwit takes Elasticsearch-format JSON queries; the load script fetches
-# hits.json.gz directly so no shared download-hits-* script applies.
-export BENCH_DOWNLOAD_SCRIPT=""
+# Quickwit takes Elasticsearch-format JSON queries.
+export BENCH_DOWNLOAD_SCRIPT="download-hits-json"
 export BENCH_DURABLE=yes
 export BENCH_QUERIES_FILE="queries.json"
 exec ../lib/benchmark-common.sh
diff --git a/quickwit/load b/quickwit/load
@@ -8,20 +8,18 @@ curl -sS -X POST http://localhost:7280/api/v1/indexes \
     -H 'Content-Type: application/yaml' \
     --data-binary @index_config.yaml | jq -r '.index_uid // .message'
 
-# No download-hits-json shared script; fetch directly. ~14 GB compressed.
-wget --continue -q 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
-
 # Use `quickwit tool local-ingest` instead of the Elasticsearch-compatible
 # bulk endpoint. v0.9's sharded ingest-v2 API caps single-node throughput
 # to a few MB/s; local-ingest builds splits directly and writes them to
 # the index storage. The running server picks up new splits on its next
 # metastore poll (default 30s).
 #
+# hits.json (delivered by the shared lib/download-hits-json helper) is fed to the container on stdin.
 # Throttle the per-second "Num docs ... Thrghput ... Time" progress lines
 # to once per ~30s so the captured log stays compact.
-zcat hits.json.gz | sudo docker run --rm -i --network host \
+sudo docker run --rm -i --network host \
     -v "$PWD/qwdata":/quickwit/qwdata \
-    "$QW_IMAGE" tool local-ingest --index hits -y 2>&1 \
+    "$QW_IMAGE" tool local-ingest --index hits -y < hits.json 2>&1 \
     | awk '/Num docs/ { n = systime(); if (n - last >= 30) { print; fflush(); last = n } next }
            { print; fflush() }'
 
@@ -31,5 +29,5 @@ sleep 35
 curl -sS "http://localhost:7280/api/v1/indexes/hits/describe" \
     | jq '{num_published_docs, num_published_splits, size_published_splits}'
 
-rm -f hits.json.gz
+rm -f hits.json
 sync
diff --git a/siglens/benchmark.sh b/siglens/benchmark.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 # Thin shim — actual flow is in lib/benchmark-common.sh.
-# siglens ingests its own gzipped NDJSON; ./load fetches it directly.
-export BENCH_DOWNLOAD_SCRIPT=""
+export BENCH_DOWNLOAD_SCRIPT="download-hits-json"
 export BENCH_DURABLE=yes
 # queries are SPL/Splunk QL, not SQL.
 export BENCH_QUERIES_FILE="queries.spl"
diff --git a/siglens/load b/siglens/load
@@ -1,19 +1,9 @@
 #!/bin/bash
 set -eu
 
-# Prefer the pre-decompressed hits.json shipped on the playground's
-# readonly dataset disk — decompressing the 22 GB hits.json.gz into
-# a 217 GB hits.json blows past the VM's sparse rootfs budget and
-# kills the load with "No space left on device". Fall back to wget +
-# pigz for standalone use.
-if [ -f /opt/clickbench/datasets_ro/hits.json ]; then
-    ln -sf /opt/clickbench/datasets_ro/hits.json hits.json
-else
-    wget --continue --progress=dot:giga \
-        'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
-    pigz -d -f hits.json.gz
-fi
-
+# hits.json is delivered by the shared lib/download-hits-json helper
+# (symlink to the RO dataset on the playground, wget + pigz
+# elsewhere).
 python3 send_datawithactionline.py
 
 rm -f hits.json
diff --git a/victorialogs/benchmark.sh b/victorialogs/benchmark.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 # Thin shim — actual flow is in lib/benchmark-common.sh.
-# victorialogs ingests gzipped NDJSON; ./load fetches it directly.
-export BENCH_DOWNLOAD_SCRIPT=""
+export BENCH_DOWNLOAD_SCRIPT="download-hits-json"
 export BENCH_DURABLE=yes
 # queries are LogsQL, not SQL.
 export BENCH_QUERIES_FILE="queries.logsql"
diff --git a/victorialogs/load b/victorialogs/load
@@ -11,20 +11,10 @@ for _ in {1..300}; do
     sleep 1
 done
 
-# Use the pre-decompressed hits.json on the playground's readonly
-# dataset disk (no temp file, no 75 GB gunzip in the VM). Fall back
-# to wget + gunzip when running standalone.
-if [ -f /opt/clickbench/datasets_ro/hits.json ]; then
-    HITS=/opt/clickbench/datasets_ro/hits.json
-else
-    wget --continue --progress=dot:giga \
-        'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
-    gunzip -f hits.json.gz
-    HITS=hits.json
-fi
+# Bulk insert via 8 parallel jsonline streams. hits.json is delivered
+# by the shared lib/download-hits-json helper (symlink to the RO
+# dataset on the playground, wget + pigz elsewhere).
+split -n r/8 -d --filter="curl -sS -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=EventTime&_stream_fields=AdvEngineID,CounterID'" hits.json
 
-# Bulk insert via 8 parallel jsonline streams.
-split -n r/8 -d --filter="curl -sS -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=EventTime&_stream_fields=AdvEngineID,CounterID'" "$HITS"
-
-[ "$HITS" = "hits.json" ] && rm -f hits.json
+rm -f hits.json
 sync