Skip to content

Commit 26a72f1

Browse files
authored
Merge branch 'main' into df_spark_floor
2 parents 711ae23 + fa9ada3 commit 26a72f1

24 files changed

Lines changed: 1098 additions & 207 deletions

File tree

.github/workflows/breaking_changes_detector.yml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,21 +59,22 @@ jobs:
5959
with:
6060
fetch-depth: 0
6161

62-
# For fork PRs, `origin` points to the fork, not the upstream repo.
63-
# Explicitly fetch the base branch from the upstream repo so we have
64-
# a valid baseline ref for both diff and semver-checks.
62+
# `origin` may point at a fork (when a contributor runs this locally) or
63+
# at a stale ref. Fetch the base branch from the PR's upstream repo into
64+
# a dedicated `apache/<base>` ref so the baseline is unambiguous and the
65+
# same ref name works locally (`git remote add apache ...`) and in CI.
6566
- name: Fetch base branch
6667
env:
6768
BASE_REF: ${{ github.base_ref }}
6869
REPO: ${{ github.repository }}
69-
run: git fetch "https://github.com/${REPO}.git" "${BASE_REF}:refs/remotes/origin/${BASE_REF}"
70+
run: git fetch "https://github.com/${REPO}.git" "${BASE_REF}:refs/remotes/apache/${BASE_REF}"
7071

7172
- name: Determine changed crates
7273
id: changed_crates
7374
env:
7475
BASE_REF: ${{ github.base_ref }}
7576
run: |
76-
PACKAGES=$(ci/scripts/changed_crates.sh changed-crates "origin/${BASE_REF}")
77+
PACKAGES=$(ci/scripts/changed_crates.sh changed-crates "apache/${BASE_REF}")
7778
echo "packages=$PACKAGES" >> "$GITHUB_OUTPUT"
7879
echo "Changed crates: $PACKAGES"
7980
@@ -102,7 +103,8 @@ jobs:
102103
set +e
103104
# `tee` lets cargo's output stream live into the Actions log
104105
# while we also keep a copy for the PR comment.
105-
ci/scripts/changed_crates.sh semver-check "origin/${BASE_REF}" $PACKAGES \
106+
# Use the `apache` ref here: it points to the repository the pull request is against.
107+
ci/scripts/changed_crates.sh semver-check "apache/${BASE_REF}" $PACKAGES \
106108
2>&1 | tee /tmp/semver-output.txt
107109
EXIT_CODE=${PIPESTATUS[0]}
108110
# Pass the result through an output instead of failing the job:

.github/workflows/breaking_changes_detector_comment.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,13 @@ on:
5050
permissions:
5151
contents: read
5252

53+
# A dedicated label, separate from the existing `api change` label.
54+
# `api change` may be applied manually for behavioral changes that aren't
55+
# strictly API changes, so we can't safely auto-remove it when this check
56+
# passes. This auto-managed label is fully owned by the workflow.
57+
env:
58+
BREAKING_CHANGE_LABEL: "auto detected api change"
59+
5360
jobs:
5461
comment-on-pr:
5562
name: Comment on pull request
@@ -130,3 +137,24 @@ jobs:
130137
number: ${{ steps.read.outputs.pr_number }}
131138
body-include: '<!-- semver-check-comment -->'
132139
delete: true
140+
141+
- name: Add "auto detected api change" label
142+
if: steps.read.outputs.result != 'success'
143+
env:
144+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
145+
REPO: ${{ github.repository }}
146+
PR_NUMBER: ${{ steps.read.outputs.pr_number }}
147+
run: |
148+
gh pr edit "$PR_NUMBER" --repo "$REPO" \
149+
--add-label "$BREAKING_CHANGE_LABEL"
150+
151+
- name: Remove "auto detected api change" label
152+
if: steps.read.outputs.result == 'success'
153+
env:
154+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
155+
REPO: ${{ github.repository }}
156+
PR_NUMBER: ${{ steps.read.outputs.pr_number }}
157+
run: |
158+
# No-op when the label isn't currently applied.
159+
gh pr edit "$PR_NUMBER" --repo "$REPO" \
160+
--remove-label "$BREAKING_CHANGE_LABEL" || true

benchmarks/bench.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,7 @@ run_tpch() {
691691

692692
debug_run env BENCH_NAME=tpch \
693693
BENCH_SIZE="${SCALE_FACTOR}" \
694+
DATA_DIR="${DATA_DIR}" \
694695
PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
695696
TPCH_FILE_TYPE="${FORMAT}" \
696697
SIMULATE_LATENCY="${SIMULATE_LATENCY}" \
@@ -709,6 +710,7 @@ run_tpch_mem() {
709710

710711
debug_run env BENCH_NAME=tpch \
711712
BENCH_SIZE="${SCALE_FACTOR}" \
713+
DATA_DIR="${DATA_DIR}" \
712714
TPCH_FILE_TYPE="mem" \
713715
PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
714716
SIMULATE_LATENCY="${SIMULATE_LATENCY}" \

benchmarks/sql_benchmarks/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,10 @@ The SQL benchmarking tool uses the following environment variables:
6868
| BENCH_QUERY | A query number to run. |
6969
| BENCH_PERSIST_RESULTS | true/false to persist benchmark results. Results are persisted in CSV format, so be mindful of how large they can get. |
7070
| BENCH_VALIDATE | true/false to validate benchmark results against persisted results or result_query's. If both `BENCH_PERSIST_RESULTS` and `BENCH_VALIDATE` are true, persist mode runs and validation is skipped. |
71+
| DATA_DIR | Root directory for benchmark data loaded by SQL benchmark files. When unset, uses `data` (relative to the benchmarks/ directory). |
7172
| SIMULATE_LATENCY | Simulate object store latency to mimic remote storage (e.g. S3). Adds random latency in the range 20-200ms to each object store operation. |
7273
| MEM_POOL_TYPE | The memory pool type to use, should be one of "fair" or "greedy". |
73-
| MEMORY_LIMIT | Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query if there's any, otherwise run with no memory limit. | |
74+
| MEMORY_LIMIT | Memory limit (e.g. '100M', '1.5G'). If not specified, runs each pre-defined memory limit for the given query if there are any; otherwise runs with no memory limit. |
7475

7576
Example – Run the H2O window benchmarks on the 'small' sized CSV data files:
7677

benchmarks/sql_benchmarks/tpch/init/load_csv.sql

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@ CREATE EXTERNAL TABLE nation
55
n_regionkey INT,
66
n_comment VARCHAR(152),
77
PRIMARY KEY (n_nationkey)
8-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/nation/nation.1.csv';
8+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/nation/nation.1.csv';
99

1010
CREATE EXTERNAL TABLE region
1111
(
1212
r_regionkey INT,
1313
r_name CHAR(25),
1414
r_comment VARCHAR(152),
1515
PRIMARY KEY (r_regionkey)
16-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/region/region.1.csv';
16+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/region/region.1.csv';
1717

1818
CREATE EXTERNAL TABLE supplier
1919
(
@@ -25,7 +25,7 @@ CREATE EXTERNAL TABLE supplier
2525
s_acctbal DECIMAL(15, 2),
2626
s_comment VARCHAR(101),
2727
PRIMARY KEY (s_suppkey)
28-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/supplier/supplier.1.csv';
28+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/supplier/supplier.1.csv';
2929

3030
CREATE EXTERNAL TABLE customer
3131
(
@@ -38,7 +38,7 @@ CREATE EXTERNAL TABLE customer
3838
c_mktsegment CHAR(10),
3939
c_comment VARCHAR(117),
4040
PRIMARY KEY (c_custkey)
41-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/customer/customer.1.csv';
41+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/customer/customer.1.csv';
4242

4343
CREATE EXTERNAL TABLE part
4444
(
@@ -52,7 +52,7 @@ CREATE EXTERNAL TABLE part
5252
p_retailprice DECIMAL(15, 2),
5353
p_comment VARCHAR(23),
5454
PRIMARY KEY (p_partkey)
55-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/part/part.1.csv';
55+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/part/part.1.csv';
5656

5757
CREATE EXTERNAL TABLE partsupp
5858
(
@@ -62,7 +62,7 @@ CREATE EXTERNAL TABLE partsupp
6262
ps_supplycost DECIMAL(15, 2),
6363
ps_comment VARCHAR(199),
6464
PRIMARY KEY (ps_partkey)
65-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/partsupp/partsupp.1.csv';
65+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/partsupp/partsupp.1.csv';
6666

6767
CREATE EXTERNAL TABLE orders
6868
(
@@ -76,7 +76,7 @@ CREATE EXTERNAL TABLE orders
7676
o_shippriority INT,
7777
o_comment VARCHAR(79),
7878
PRIMARY KEY (o_orderkey)
79-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/orders/orders.1.csv';
79+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/orders/orders.1.csv';
8080

8181
CREATE EXTERNAL TABLE lineitem
8282
(
@@ -96,4 +96,4 @@ CREATE EXTERNAL TABLE lineitem
9696
l_shipinstruct CHAR(25),
9797
l_shipmode CHAR(10),
9898
l_comment VARCHAR(44)
99-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/lineitem/lineitem.1.csv';
99+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/lineitem/lineitem.1.csv';

benchmarks/sql_benchmarks/tpch/init/load_mem.sql

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
CREATE EXTERNAL TABLE nation_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
1+
CREATE EXTERNAL TABLE nation_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
22

3-
CREATE EXTERNAL TABLE region_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
3+
CREATE EXTERNAL TABLE region_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
44

5-
CREATE EXTERNAL TABLE supplier_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
5+
CREATE EXTERNAL TABLE supplier_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
66

7-
CREATE EXTERNAL TABLE customer_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
7+
CREATE EXTERNAL TABLE customer_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
88

9-
CREATE EXTERNAL TABLE part_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
9+
CREATE EXTERNAL TABLE part_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
1010

11-
CREATE EXTERNAL TABLE partsupp_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
11+
CREATE EXTERNAL TABLE partsupp_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
1212

13-
CREATE EXTERNAL TABLE orders_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
13+
CREATE EXTERNAL TABLE orders_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
1414

15-
CREATE EXTERNAL TABLE lineitem_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
15+
CREATE EXTERNAL TABLE lineitem_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
1616

1717
CREATE TABLE nation as SELECT * FROM nation_raw;
1818

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
CREATE EXTERNAL TABLE nation STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
1+
CREATE EXTERNAL TABLE nation STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
22

3-
CREATE EXTERNAL TABLE region STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
3+
CREATE EXTERNAL TABLE region STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
44

5-
CREATE EXTERNAL TABLE supplier STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
5+
CREATE EXTERNAL TABLE supplier STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
66

7-
CREATE EXTERNAL TABLE customer STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
7+
CREATE EXTERNAL TABLE customer STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
88

9-
CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
9+
CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
1010

11-
CREATE EXTERNAL TABLE partsupp STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
11+
CREATE EXTERNAL TABLE partsupp STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
1212

13-
CREATE EXTERNAL TABLE orders STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
13+
CREATE EXTERNAL TABLE orders STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
1414

15-
CREATE EXTERNAL TABLE lineitem STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
15+
CREATE EXTERNAL TABLE lineitem STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';

benchmarks/src/sql_benchmark.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2230,6 +2230,74 @@ NULL|(empty)
22302230
);
22312231
}
22322232

2233+
#[tokio::test]
2234+
async fn parser_applies_data_dir_replacement_in_load_query_file() {
2235+
let temp_dir = tempdir().expect("failed to create benchmark test directory");
2236+
let data_dir = temp_dir.path().join("non_default_data");
2237+
let csv_dir = data_dir.join("tpch_sf1/csv/generated");
2238+
fs::create_dir_all(&csv_dir).expect("failed to create generated data directory");
2239+
fs::write(csv_dir.join("generated.1.csv"), "value\n42\n")
2240+
.expect("failed to write generated csv file");
2241+
2242+
let load_path = write_test_file(
2243+
&temp_dir,
2244+
"load_generated_csv.sql",
2245+
"CREATE EXTERNAL TABLE generated(value INT) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/generated/generated.1.csv' OPTIONS ('format.has_header' 'true');\n",
2246+
);
2247+
let template_path = write_test_file(
2248+
&temp_dir,
2249+
"load_file_template.benchmark",
2250+
&format!(
2251+
"load {}\n\nrun\nSELECT value FROM generated;\n",
2252+
load_path.display()
2253+
),
2254+
);
2255+
let benchmark_path = write_test_file(
2256+
&temp_dir,
2257+
"load_file_driver.benchmark",
2258+
&format!(
2259+
"template {}\nDATA_DIR={}\n",
2260+
template_path.display(),
2261+
data_dir.display()
2262+
),
2263+
);
2264+
2265+
let ctx = SessionContext::new();
2266+
let path_string = benchmark_path.to_string_lossy().into_owned();
2267+
let mut benchmark = SqlBenchmark::new(&ctx, &path_string, "/tmp")
2268+
.await
2269+
.expect("benchmark should parse");
2270+
2271+
let load_queries = benchmark
2272+
.queries()
2273+
.get(&QueryDirective::Load)
2274+
.expect("load queries");
2275+
assert_eq!(load_queries.len(), 1);
2276+
assert!(
2277+
load_queries.iter().all(|query| !query.contains("${")),
2278+
"all placeholders should be replaced: {load_queries:?}"
2279+
);
2280+
let expected_location = format!(
2281+
"LOCATION '{}/tpch_sf1/csv/generated/generated.1.csv'",
2282+
data_dir.display()
2283+
);
2284+
assert!(
2285+
load_queries[0].contains(&expected_location),
2286+
"all load locations should use the non-default DATA_DIR: {load_queries:?}"
2287+
);
2288+
2289+
benchmark
2290+
.initialize(&ctx)
2291+
.await
2292+
.expect("benchmark should load generated csv file");
2293+
benchmark
2294+
.run(&ctx, true)
2295+
.await
2296+
.expect("benchmark should read generated csv file");
2297+
2298+
assert_eq!(formatted_last_results(&benchmark), vec![vec!["42"]]);
2299+
}
2300+
22332301
#[tokio::test]
22342302
async fn parser_rejects_inline_sql_when_query_file_is_provided() {
22352303
let temp_dir = tempdir().expect("failed to create benchmark test directory");

0 commit comments

Comments
 (0)