chore: Add Java Flight Recorder profiling to TPC benchmarks (#3597)

andygrove · web-flow · commit 68b2c4d8f858 · 2026-02-25T17:40:24.000-07:00
diff --git a/benchmarks/tpc/README.md b/benchmarks/tpc/README.md
@@ -38,15 +38,17 @@ All benchmarks are run via `run.py`:
 python3 run.py --engine <engine> --benchmark <tpch|tpcds> [options]
 ```
 
-| Option         | Description                                      |
-| -------------- | ------------------------------------------------ |
-| `--engine`     | Engine name (matches a TOML file in `engines/`)  |
-| `--benchmark`  | `tpch` or `tpcds`                                |
-| `--iterations` | Number of iterations (default: 1)                |
-| `--output`     | Output directory (default: `.`)                  |
-| `--query`      | Run a single query number                        |
-| `--no-restart` | Skip Spark master/worker restart                 |
-| `--dry-run`    | Print the spark-submit command without executing |
+| Option         | Description                                              |
+| -------------- | -------------------------------------------------------- |
+| `--engine`     | Engine name (matches a TOML file in `engines/`)          |
+| `--benchmark`  | `tpch` or `tpcds`                                        |
+| `--iterations` | Number of iterations (default: 1)                        |
+| `--output`     | Output directory (default: `.`)                          |
+| `--query`      | Run a single query number                                |
+| `--no-restart` | Skip Spark master/worker restart                         |
+| `--dry-run`    | Print the spark-submit command without executing         |
+| `--jfr`        | Enable Java Flight Recorder profiling                    |
+| `--jfr-dir`    | Directory for JFR output files (default: `/results/jfr`) |
 
 Available engines: `spark`, `comet`, `comet-iceberg`, `gluten`
 
@@ -363,3 +365,30 @@ python3 generate-comparison.py --benchmark tpch \
     --title "TPC-H @ 100 GB: Parquet vs Iceberg" \
     comet-tpch-*.json comet-iceberg-tpch-*.json
 ```
+
+## Java Flight Recorder Profiling
+
+Use the `--jfr` flag to capture JFR profiles from the Spark driver and executors.
+JFR is built into JDK 11+, so no additional dependencies are needed.
+
+```shell
+python3 run.py --engine comet --benchmark tpch --jfr
+```
+
+JFR recordings are written to `/results/jfr/` by default (configurable with
+`--jfr-dir`). The driver writes `driver.jfr` and each executor writes
+`executor.jfr` (JFR appends the PID when multiple executors share a path).
+
+With Docker Compose, the `/results` volume is shared across all containers,
+so JFR files from both driver and executors are collected in
+`$RESULTS_DIR/jfr/` on the host:
+
+```shell
+docker compose -f benchmarks/tpc/infra/docker/docker-compose.yml \
+    run --rm bench \
+    python3 /opt/benchmarks/run.py \
+    --engine comet --benchmark tpch --output /results --no-restart --jfr
+```
+
+Open the `.jfr` files with [JDK Mission Control](https://jdk.java.net/jmc/),
+IntelliJ IDEA's profiler, or the `jfr` CLI tool (`jfr summary driver.jfr`).
diff --git a/benchmarks/tpc/infra/docker/docker-compose-laptop.yml b/benchmarks/tpc/infra/docker/docker-compose-laptop.yml
@@ -72,6 +72,7 @@ services:
       - SPARK_NO_DAEMONIZE=true
     mem_limit: 8g
     memswap_limit: 8g
+    stop_grace_period: 30s
 
   bench:
     image: ${BENCH_IMAGE:-comet-bench}
diff --git a/benchmarks/tpc/infra/docker/docker-compose.yml b/benchmarks/tpc/infra/docker/docker-compose.yml
@@ -56,6 +56,7 @@ x-worker: &worker
     - SPARK_NO_DAEMONIZE=true
   mem_limit: ${WORKER_MEM_LIMIT:-32g}
   memswap_limit: ${WORKER_MEM_LIMIT:-32g}
+  stop_grace_period: 30s
 
 services:
   spark-master:
diff --git a/benchmarks/tpc/run.py b/benchmarks/tpc/run.py
@@ -261,6 +261,24 @@ def build_spark_submit_cmd(config, benchmark, args):
             val = "true" if val else "false"
         conf[resolve_env(key)] = resolve_env(str(val))
 
+    # JFR profiling: append to extraJavaOptions (preserving any existing values)
+    if args.jfr:
+        jfr_dir = args.jfr_dir
+        driver_jfr = (
+            f"-XX:StartFlightRecording=disk=true,dumponexit=true,"
+            f"filename={jfr_dir}/driver.jfr,settings=profile"
+        )
+        executor_jfr = (
+            f"-XX:StartFlightRecording=disk=true,dumponexit=true,"
+            f"filename={jfr_dir}/executor.jfr,settings=profile"
+        )
+        for spark_key, jfr_opts in [
+            ("spark.driver.extraJavaOptions", driver_jfr),
+            ("spark.executor.extraJavaOptions", executor_jfr),
+        ]:
+            existing = conf.get(spark_key, "")
+            conf[spark_key] = f"{existing} {jfr_opts}".strip()
+
     for key, val in sorted(conf.items()):
         cmd += ["--conf", f"{key}={val}"]
 
@@ -357,6 +375,16 @@ def main():
         action="store_true",
         help="Print the spark-submit command without executing",
     )
+    parser.add_argument(
+        "--jfr",
+        action="store_true",
+        help="Enable Java Flight Recorder profiling for driver and executors",
+    )
+    parser.add_argument(
+        "--jfr-dir",
+        default="/results/jfr",
+        help="Directory for JFR output files (default: /results/jfr)",
+    )
     args = parser.parse_args()
 
     config = load_engine_config(args.engine)
@@ -373,6 +401,10 @@ def main():
     if not args.no_restart and not args.dry_run:
         restart_spark()
 
+    # Create JFR output directory if profiling is enabled
+    if args.jfr:
+        os.makedirs(args.jfr_dir, exist_ok=True)
+
     cmd = build_spark_submit_cmd(config, args.benchmark, args)
 
     if args.dry_run: