apache
diff --git a/‎.github/dependabot.yml‎
Lines changed: 16 additions & 0 deletions b/‎.github/dependabot.yml‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎.github/workflows/stale.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/stale.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/benchmarks/.gitignore‎ ‎benchmarks/tpc/.gitignore‎dev/benchmarks/.gitignore renamed to benchmarks/tpc/.gitignore b/‎dev/benchmarks/.gitignore‎ ‎benchmarks/tpc/.gitignore‎dev/benchmarks/.gitignore renamed to benchmarks/tpc/.gitignore
diff --git a/‎dev/benchmarks/README.md‎ ‎benchmarks/tpc/README.md‎dev/benchmarks/README.md renamed to benchmarks/tpc/README.md
Lines changed: 65 additions & 19 deletions b/‎dev/benchmarks/README.md‎ ‎benchmarks/tpc/README.md‎dev/benchmarks/README.md renamed to benchmarks/tpc/README.md
Lines changed: 65 additions & 19 deletions
diff --git a/‎benchmarks/tpc/create-iceberg-tables.py‎
Lines changed: 171 additions & 0 deletions b/‎benchmarks/tpc/create-iceberg-tables.py‎
Lines changed: 171 additions & 0 deletions
diff --git a/‎dev/benchmarks/drop-caches.sh‎ ‎benchmarks/tpc/drop-caches.sh‎dev/benchmarks/drop-caches.sh renamed to benchmarks/tpc/drop-caches.sh b/‎dev/benchmarks/drop-caches.sh‎ ‎benchmarks/tpc/drop-caches.sh‎dev/benchmarks/drop-caches.sh renamed to benchmarks/tpc/drop-caches.sh
@@ -37,6 +37,22 @@ updates:
         patterns:
           - "prost*"
           - "pbjson*"
+      # Catch-all: group only minor/patch updates into a single PR,
+      # excluding deps we always want kept separate (and excluding arrow/parquet, which have their own group)
+      all-other-cargo-deps:
+        applies-to: version-updates
+        patterns:
+          - "*"
+        exclude-patterns:
+          - "arrow*"
+          - "parquet"
+          - "object_store"
+          - "sqlparser"
+          - "prost*"
+          - "pbjson*"
+        update-types:
+          - "minor"
+          - "patch"
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
 
@@ -27,7 +27,7 @@ jobs:
       issues: write
       pull-requests: write
     steps:
-      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d  # v10.1.1
+      - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f  # v10.2.0
         with:
           stale-pr-message: "Thank you for your contribution. Unfortunately, this pull request is stale because it has been open 60 days with no activity. Please remove the stale label or comment or this will be closed in 7 days."
           days-before-pr-stale: 60
 
@@ -26,6 +26,26 @@ For full instructions on running these benchmarks on an EC2 instance, see the [C
 
 [Comet Benchmarking on EC2 Guide]: https://datafusion.apache.org/comet/contributor-guide/benchmarking_aws_ec2.html
 
+## Usage
+
+All benchmarks are run via `run.py`:
+
+```shell
+python3 run.py --engine <engine> --benchmark <tpch|tpcds> [options]
+```
+
+| Option         | Description                                      |
+| -------------- | ------------------------------------------------ |
+| `--engine`     | Engine name (matches a TOML file in `engines/`)  |
+| `--benchmark`  | `tpch` or `tpcds`                                |
+| `--iterations` | Number of iterations (default: 1)                |
+| `--output`     | Output directory (default: `.`)                  |
+| `--query`      | Run a single query number                        |
+| `--no-restart` | Skip Spark master/worker restart                 |
+| `--dry-run`    | Print the spark-submit command without executing |
+
+Available engines: `spark`, `comet`, `comet-iceberg`, `gluten`
+
 ## Example usage
 
 Set Spark environment variables:
@@ -47,7 +67,7 @@ Run Spark benchmark:
 ```shell
 export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
 sudo ./drop-caches.sh
-./spark-tpch.sh
+python3 run.py --engine spark --benchmark tpch
 ```
 
 Run Comet benchmark:
@@ -56,7 +76,7 @@ Run Comet benchmark:
 export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
 export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.10.0.jar
 sudo ./drop-caches.sh
-./comet-tpch.sh
+python3 run.py --engine comet --benchmark tpch
 ```
 
 Run Gluten benchmark:
@@ -65,7 +85,13 @@ Run Gluten benchmark:
 export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
 export GLUTEN_JAR=/opt/gluten/gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar
 sudo ./drop-caches.sh
-./gluten-tpch.sh
+python3 run.py --engine gluten --benchmark tpch
+```
+
+Preview a command without running it:
+
+```shell
+python3 run.py --engine comet --benchmark tpch --dry-run
 ```
 
 Generating charts:
@@ -74,6 +100,11 @@ Generating charts:
 python3 generate-comparison.py --benchmark tpch --labels "Spark 3.5.3" "Comet 0.9.0" "Gluten 1.4.0" --title "TPC-H @ 100 GB (single executor, 8 cores, local Parquet files)" spark-tpch-1752338506381.json comet-tpch-1752337818039.json gluten-tpch-1752337474344.json
 ```
 
+## Engine Configuration
+
+Each engine is defined by a TOML file in `engines/`. The config specifies JARs, Spark conf overrides,
+required environment variables, and optional defaults/exports. See existing files for examples.
+
 ## Iceberg Benchmarking
 
 Comet includes native Iceberg support via iceberg-rust integration. This enables benchmarking TPC-H queries
@@ -90,14 +121,16 @@ export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
 
 Note: Table creation uses `--packages` which auto-downloads the dependency.
 
-### Create Iceberg TPC-H tables
+### Create Iceberg tables
 
-Convert existing Parquet TPC-H data to Iceberg format:
+Convert existing Parquet data to Iceberg format using `create-iceberg-tables.py`.
+The script configures the Iceberg catalog automatically -- no catalog-related `--conf` flags are needed.
 
 ```shell
 export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse
-export ICEBERG_CATALOG=${ICEBERG_CATALOG:-local}
+mkdir -p $ICEBERG_WAREHOUSE
 
+# TPC-H
 $SPARK_HOME/bin/spark-submit \
     --master $SPARK_MASTER \
     --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
@@ -106,13 +139,24 @@ $SPARK_HOME/bin/spark-submit \
     --conf spark.executor.cores=8 \
     --conf spark.cores.max=8 \
     --conf spark.executor.memory=16g \
-    --conf spark.sql.catalog.${ICEBERG_CATALOG}=org.apache.iceberg.spark.SparkCatalog \
-    --conf spark.sql.catalog.${ICEBERG_CATALOG}.type=hadoop \
-    --conf spark.sql.catalog.${ICEBERG_CATALOG}.warehouse=$ICEBERG_WAREHOUSE \
-    create-iceberg-tpch.py \
+    create-iceberg-tables.py \
+    --benchmark tpch \
     --parquet-path $TPCH_DATA \
-    --catalog $ICEBERG_CATALOG \
-    --database tpch
+    --warehouse $ICEBERG_WAREHOUSE
+
+# TPC-DS
+$SPARK_HOME/bin/spark-submit \
+    --master $SPARK_MASTER \
+    --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
+    --conf spark.driver.memory=8G \
+    --conf spark.executor.instances=2 \
+    --conf spark.executor.cores=8 \
+    --conf spark.cores.max=16 \
+    --conf spark.executor.memory=16g \
+    create-iceberg-tables.py \
+    --benchmark tpcds \
+    --parquet-path $TPCDS_DATA \
+    --warehouse $ICEBERG_WAREHOUSE
 ```
 
 ### Run Iceberg benchmark
@@ -124,20 +168,22 @@ export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
 export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse
 export TPCH_QUERIES=/mnt/bigdata/tpch/queries/
 sudo ./drop-caches.sh
-./comet-tpch-iceberg.sh
+python3 run.py --engine comet-iceberg --benchmark tpch
 ```
 
 The benchmark uses `spark.comet.scan.icebergNative.enabled=true` to enable Comet's native iceberg-rust
 integration. Verify native scanning is active by checking for `CometIcebergNativeScanExec` in the
 physical plan output.
 
-### Iceberg-specific options
+### create-iceberg-tables.py options
 
-| Environment Variable | Default    | Description                         |
-| -------------------- | ---------- | ----------------------------------- |
-| `ICEBERG_CATALOG`    | `local`    | Iceberg catalog name                |
-| `ICEBERG_DATABASE`   | `tpch`     | Database containing TPC-H tables    |
-| `ICEBERG_WAREHOUSE`  | (required) | Path to Iceberg warehouse directory |
+| Option           | Required | Default        | Description                         |
+| ---------------- | -------- | -------------- | ----------------------------------- |
+| `--benchmark`    | Yes      |                | `tpch` or `tpcds`                   |
+| `--parquet-path` | Yes      |                | Path to source Parquet data         |
+| `--warehouse`    | Yes      |                | Path to Iceberg warehouse directory |
+| `--catalog`      | No       | `local`        | Iceberg catalog name                |
+| `--database`     | No       | benchmark name | Database name for the tables        |
 
 ### Comparing Parquet vs Iceberg performance
 
 
@@ -0,0 +1,171 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+r"""
+Convert TPC-H or TPC-DS Parquet data to Iceberg tables.
+
+Usage:
+    spark-submit \
+        --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
+        create-iceberg-tables.py \
+        --benchmark tpch \
+        --parquet-path /path/to/tpch/parquet \
+        --warehouse /path/to/iceberg-warehouse
+
+    spark-submit \
+        --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
+        create-iceberg-tables.py \
+        --benchmark tpcds \
+        --parquet-path /path/to/tpcds/parquet \
+        --warehouse /path/to/iceberg-warehouse
+"""
+
+import argparse
+import os
+import sys
+from pyspark.sql import SparkSession
+import time
+
+TPCH_TABLES = [
+    "customer",
+    "lineitem",
+    "nation",
+    "orders",
+    "part",
+    "partsupp",
+    "region",
+    "supplier",
+]
+
+TPCDS_TABLES = [
+    "call_center",
+    "catalog_page",
+    "catalog_returns",
+    "catalog_sales",
+    "customer",
+    "customer_address",
+    "customer_demographics",
+    "date_dim",
+    "time_dim",
+    "household_demographics",
+    "income_band",
+    "inventory",
+    "item",
+    "promotion",
+    "reason",
+    "ship_mode",
+    "store",
+    "store_returns",
+    "store_sales",
+    "warehouse",
+    "web_page",
+    "web_returns",
+    "web_sales",
+    "web_site",
+]
+
+BENCHMARK_TABLES = {
+    "tpch": TPCH_TABLES,
+    "tpcds": TPCDS_TABLES,
+}
+
+
+def main(benchmark: str, parquet_path: str, warehouse: str, catalog: str, database: str):
+    table_names = BENCHMARK_TABLES[benchmark]
+
+    # Validate paths before starting Spark
+    errors = []
+    if not os.path.isdir(parquet_path):
+        errors.append(f"Error: --parquet-path '{parquet_path}' does not exist or is not a directory")
+    if not os.path.isdir(warehouse):
+        errors.append(f"Error: --warehouse '{warehouse}' does not exist or is not a directory. "
+                       "Create it with: mkdir -p " + warehouse)
+    if errors:
+        for e in errors:
+            print(e, file=sys.stderr)
+        sys.exit(1)
+
+    spark = SparkSession.builder \
+        .appName(f"Create Iceberg {benchmark.upper()} Tables") \
+        .config(f"spark.sql.catalog.{catalog}", "org.apache.iceberg.spark.SparkCatalog") \
+        .config(f"spark.sql.catalog.{catalog}.type", "hadoop") \
+        .config(f"spark.sql.catalog.{catalog}.warehouse", warehouse) \
+        .getOrCreate()
+
+    # Set the Iceberg catalog as the current catalog so that
+    # namespace operations are routed correctly
+    spark.sql(f"USE {catalog}")
+
+    # Create namespace if it doesn't exist
+    try:
+        spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {database}")
+    except Exception:
+        # Namespace may already exist
+        pass
+
+    for table in table_names:
+        parquet_table_path = f"{parquet_path}/{table}.parquet"
+        iceberg_table = f"{catalog}.{database}.{table}"
+
+        print(f"Converting {parquet_table_path} -> {iceberg_table}")
+        start_time = time.time()
+
+        # Drop table if exists to allow re-running
+        spark.sql(f"DROP TABLE IF EXISTS {iceberg_table}")
+
+        # Read parquet and write as Iceberg
+        df = spark.read.parquet(parquet_table_path)
+        df.writeTo(iceberg_table).using("iceberg").create()
+
+        row_count = spark.table(iceberg_table).count()
+        elapsed = time.time() - start_time
+        print(f"  Created {iceberg_table} with {row_count} rows in {elapsed:.2f}s")
+
+    print(f"\nAll {benchmark.upper()} tables created successfully!")
+    print(f"Tables available at: {catalog}.{database}.*")
+
+    spark.stop()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert TPC-H or TPC-DS Parquet data to Iceberg tables"
+    )
+    parser.add_argument(
+        "--benchmark", required=True, choices=["tpch", "tpcds"],
+        help="Benchmark whose tables to convert (tpch or tpcds)"
+    )
+    parser.add_argument(
+        "--parquet-path", required=True,
+        help="Path to Parquet data directory"
+    )
+    parser.add_argument(
+        "--warehouse", required=True,
+        help="Path to Iceberg warehouse directory"
+    )
+    parser.add_argument(
+        "--catalog", default="local",
+        help="Iceberg catalog name (default: 'local')"
+    )
+    parser.add_argument(
+        "--database", default=None,
+        help="Database name to create tables in (defaults to benchmark name)"
+    )
+    args = parser.parse_args()
+
+    database = args.database if args.database else args.benchmark
+    main(args.benchmark, args.parquet_path, args.warehouse, args.catalog, database)