LuciferYang
diff --git a/‎benchmark/scripts/run-benchmark.sh‎
Lines changed: 3 additions & 0 deletions b/‎benchmark/scripts/run-benchmark.sh‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎benchmark/src/main/java/org/lance/spark/benchmark/BenchmarkReporter.java‎
Lines changed: 123 additions & 0 deletions b/‎benchmark/src/main/java/org/lance/spark/benchmark/BenchmarkReporter.java‎
Lines changed: 123 additions & 0 deletions
diff --git a/‎benchmark/src/main/java/org/lance/spark/benchmark/BenchmarkResult.java‎
Lines changed: 48 additions & 5 deletions b/‎benchmark/src/main/java/org/lance/spark/benchmark/BenchmarkResult.java‎
Lines changed: 48 additions & 5 deletions
diff --git a/‎benchmark/src/main/java/org/lance/spark/benchmark/DfpClusterRebuilder.java‎
Lines changed: 105 additions & 0 deletions b/‎benchmark/src/main/java/org/lance/spark/benchmark/DfpClusterRebuilder.java‎
Lines changed: 105 additions & 0 deletions
@@ -85,6 +85,9 @@ fi
 if [ -n "${QUERIES:-}" ]; then
   BENCHMARK_EXTRA_ARGS="${BENCHMARK_EXTRA_ARGS} --queries ${QUERIES}"
 fi
+if [ -n "${DFP_MODE:-}" ]; then
+  BENCHMARK_EXTRA_ARGS="${BENCHMARK_EXTRA_ARGS} --dfp-mode ${DFP_MODE}"
+fi
 
 ${SPARK_SUBMIT} \
   --class org.lance.spark.benchmark.TpcdsBenchmarkRunner \
 
@@ -183,6 +183,129 @@ public void printSummary() {
     }
 
     System.out.printf("Queries passed: %d, partial/failed: %d%n", passCount, failCount);
+
+    printDfpComparison();
+  }
+
+  /**
+   * Prints a DFP on-vs-off comparison table for Lance results.
+   *
+   * <p>Only successful results whose format is {@code "lance"} and whose DFP mode is
+   * {@link BenchmarkResult#DFP_ON} or {@link BenchmarkResult#DFP_OFF} are considered.
+   * The section is printed only when at least one query has measurements in both modes;
+   * otherwise this is a silent no-op, so a regular Parquet-vs-Lance run (no DFP sweep) doesn't
+   * grow an irrelevant extra section.
+   *
+   * <p>Per-query times are reduced to the element at index {@code size / 2} of the sorted
+   * samples. A speedup (OFF/ON) is computed only when both of those values are positive: a
+   * zero-millisecond measurement would otherwise produce an infinite ratio or a {@code log(0)}
+   * term that collapses the geometric mean, so such rows show {@code -} and are left out of it.
+   */
+  private void printDfpComparison() {
+    // Elapsed times and first-seen fragment counts per query, split by DFP mode.
+    // LinkedHashMap keeps the table in the order queries first appear in the results.
+    Map<String, List<Long>> timesOn = new LinkedHashMap<>();
+    Map<String, List<Long>> timesOff = new LinkedHashMap<>();
+    Map<String, Long> fragmentsOn = new LinkedHashMap<>();
+    Map<String, Long> fragmentsOff = new LinkedHashMap<>();
+
+    for (BenchmarkResult r : results) {
+      if (!r.isSuccess() || !"lance".equals(r.getFormat())) {
+        continue;
+      }
+      Map<String, List<Long>> times;
+      Map<String, Long> fragments;
+      if (BenchmarkResult.DFP_ON.equals(r.getDfpMode())) {
+        times = timesOn;
+        fragments = fragmentsOn;
+      } else if (BenchmarkResult.DFP_OFF.equals(r.getDfpMode())) {
+        times = timesOff;
+        fragments = fragmentsOff;
+      } else {
+        // Any other mode (e.g. "n/a") is not part of an on/off sweep.
+        continue;
+      }
+      times.computeIfAbsent(r.getQueryName(), k -> new ArrayList<>()).add(r.getElapsedMs());
+      // Keep only the first non-negative fragmentsScanned per query+mode. The metric is
+      // expected to be plan-level and identical across iterations, so it is not medianed.
+      QueryMetrics qm = r.getMetrics();
+      if (qm != null && qm.getLanceFragmentsScanned() >= 0) {
+        fragments.putIfAbsent(r.getQueryName(), qm.getLanceFragmentsScanned());
+      }
+    }
+
+    Map<String, Long> medianOn = upperMedianByQuery(timesOn);
+    Map<String, Long> medianOff = upperMedianByQuery(timesOff);
+
+    // Only emit the section when we have paired on/off measurements on at least one query.
+    boolean hasPair = medianOn.keySet().stream().anyMatch(medianOff::containsKey);
+    if (!hasPair) {
+      return;
+    }
+
+    System.out.println();
+    System.out.println("=== DFP On-vs-Off Comparison (Lance only) ===");
+    System.out.println();
+    System.out.printf(
+        "%-8s %10s %10s %8s %10s %10s %8s%n",
+        "Query", "OFF(ms)", "ON(ms)", "Speedup", "Frags OFF", "Frags ON", "Pruned%");
+    System.out.println("-".repeat(70));
+
+    List<Double> speedups = new ArrayList<>();
+    List<Double> prunePcts = new ArrayList<>();
+    int firedCount = 0;
+
+    for (Map.Entry<String, Long> entry : medianOn.entrySet()) {
+      String q = entry.getKey();
+      Long offMedian = medianOff.get(q);
+      if (offMedian == null) {
+        continue;
+      }
+      long on = entry.getValue();
+      long off = offMedian;
+
+      // Guard the divisor as well as the numerator: on == 0 would give Infinity, and off == 0
+      // would give 0.0 whose logarithm is -Infinity in the geometric mean below.
+      String speedupStr = "-";
+      if (on > 0 && off > 0) {
+        double speedup = (double) off / on;
+        speedups.add(speedup);
+        speedupStr = String.format("%.2fx", speedup);
+      }
+
+      // Stored fragment counts are always >= 0 (filtered on insert), so null is the only
+      // "missing" state to handle here.
+      Long fOn = fragmentsOn.get(q);
+      Long fOff = fragmentsOff.get(q);
+      String fOnStr = fOn == null ? "-" : String.valueOf(fOn);
+      String fOffStr = fOff == null ? "-" : String.valueOf(fOff);
+      String prunedPctStr = "-";
+      if (fOn != null && fOff != null && fOff > 0) {
+        double pct = 100.0 * (fOff - fOn) / fOff;
+        prunedPctStr = String.format("%.1f%%", pct);
+        prunePcts.add(pct);
+        if (fOn < fOff) {
+          firedCount++;
+        }
+      }
+
+      // %9s matches the width of the former "%8.2fx" column (8 digits plus the 'x' suffix).
+      System.out.printf(
+          "%-8s %10d %10d %9s %10s %10s %8s%n",
+          q, off, on, speedupStr, fOffStr, fOnStr, prunedPctStr);
+    }
+
+    System.out.println();
+    if (!speedups.isEmpty()) {
+      double logSum = 0;
+      for (double s : speedups) {
+        logSum += Math.log(s);
+      }
+      System.out.printf(
+          "Geometric mean speedup (OFF/ON): %.2fx across %d queries%n",
+          Math.exp(logSum / speedups.size()), speedups.size());
+    }
+    if (!prunePcts.isEmpty()) {
+      double avg = prunePcts.stream().mapToDouble(Double::doubleValue).average().orElse(0.0);
+      System.out.printf(
+          "DFP fired on %d / %d queries; mean fragment reduction: %.1f%%%n",
+          firedCount, prunePcts.size(), avg);
+    }
+  }
+
+  /**
+   * Sorts each query's samples in place and maps the query to the sample at index
+   * {@code size / 2} (the upper middle element for even-sized lists).
+   *
+   * @param timesByQuery non-empty sample lists keyed by query name; the lists are reordered
+   * @return per-query representative time, in the iteration order of {@code timesByQuery}
+   */
+  private static Map<String, Long> upperMedianByQuery(Map<String, List<Long>> timesByQuery) {
+    Map<String, Long> medians = new LinkedHashMap<>();
+    for (Map.Entry<String, List<Long>> e : timesByQuery.entrySet()) {
+      List<Long> samples = e.getValue();
+      samples.sort(Long::compareTo);
+      medians.put(e.getKey(), samples.get(samples.size() / 2));
+    }
+    return medians;
   }
 
   private QueryMetrics findMetricsForQuery(String queryName) {
 
@@ -15,13 +15,24 @@
 
 public class BenchmarkResult {
 
+  /**
+   * DFP toggle state during this run. For Lance: {@code "on"} or {@code "off"}. For non-Lance
+   * scans the flag has no effect, so we record {@code "n/a"} to make cross-format joins on the
+   * CSV unambiguous.
+   */
+  public static final String DFP_ON = "on";
+
+  public static final String DFP_OFF = "off";
+  public static final String DFP_NA = "n/a";
+
   private final String queryName;
   private final String format;
   private final int iteration;
   private final long elapsedMs;
   private final boolean success;
   private final String errorMessage;
   private final QueryMetrics metrics;
+  private final String dfpMode;
 
   private BenchmarkResult(
       String queryName,
@@ -30,29 +41,56 @@ private BenchmarkResult(
       long elapsedMs,
       boolean success,
       String errorMessage,
-      QueryMetrics metrics) {
+      QueryMetrics metrics,
+      String dfpMode) {
     this.queryName = queryName;
     this.format = format;
     this.iteration = iteration;
     this.elapsedMs = elapsedMs;
     this.success = success;
     this.errorMessage = errorMessage;
     this.metrics = metrics;
+    this.dfpMode = dfpMode == null ? DFP_NA : dfpMode;
   }
 
   public static BenchmarkResult success(
       String queryName, String format, int iteration, long elapsedMs) {
-    return new BenchmarkResult(queryName, format, iteration, elapsedMs, true, null, null);
+    return new BenchmarkResult(
+        queryName, format, iteration, elapsedMs, true, null, null, DFP_NA);
   }
 
   public static BenchmarkResult success(
       String queryName, String format, int iteration, long elapsedMs, QueryMetrics metrics) {
-    return new BenchmarkResult(queryName, format, iteration, elapsedMs, true, null, metrics);
+    return new BenchmarkResult(
+        queryName, format, iteration, elapsedMs, true, null, metrics, DFP_NA);
+  }
+
+  /**
+   * Builds a successful result that also carries the DFP mode the query ran under.
+   *
+   * @param queryName query name stored on the result
+   * @param format format label stored on the result
+   * @param iteration iteration value stored on the result
+   * @param elapsedMs elapsed time in milliseconds stored on the result
+   * @param metrics metrics object stored on the result as-is
+   * @param dfpMode DFP mode label; the constructor stores {@link #DFP_NA} when this is null
+   * @return a result flagged as successful, with no error message
+   */
+  public static BenchmarkResult success(
+      String queryName,
+      String format,
+      int iteration,
+      long elapsedMs,
+      QueryMetrics metrics,
+      String dfpMode) {
+    final boolean succeeded = true;
+    final String noErrorMessage = null;
+    return new BenchmarkResult(
+        queryName, format, iteration, elapsedMs, succeeded, noErrorMessage, metrics, dfpMode);
   }
 
   public static BenchmarkResult failure(
       String queryName, String format, int iteration, long elapsedMs, String errorMessage) {
-    return new BenchmarkResult(queryName, format, iteration, elapsedMs, false, errorMessage, null);
+    return new BenchmarkResult(
+        queryName, format, iteration, elapsedMs, false, errorMessage, null, DFP_NA);
+  }
+
+  /**
+   * Builds a failed result that also carries the DFP mode the query ran under.
+   *
+   * @param queryName query name stored on the result
+   * @param format format label stored on the result
+   * @param iteration iteration value stored on the result
+   * @param elapsedMs elapsed time in milliseconds stored on the result
+   * @param errorMessage error text stored on the result
+   * @param dfpMode DFP mode label; the constructor stores {@link #DFP_NA} when this is null
+   * @return a result flagged as unsuccessful, with no metrics attached
+   */
+  public static BenchmarkResult failure(
+      String queryName,
+      String format,
+      int iteration,
+      long elapsedMs,
+      String errorMessage,
+      String dfpMode) {
+    final boolean succeeded = false;
+    final QueryMetrics noMetrics = null;
+    return new BenchmarkResult(
+        queryName, format, iteration, elapsedMs, succeeded, errorMessage, noMetrics, dfpMode);
   }
 
   public String getQueryName() {
@@ -83,12 +121,17 @@ public QueryMetrics getMetrics() {
     return metrics;
   }
 
+  /**
+   * Returns the DFP mode label recorded for this result.
+   *
+   * @return the stored label; the constructor replaces a null input with {@link #DFP_NA}
+   */
+  public String getDfpMode() {
+    return this.dfpMode;
+  }
+
   public String toCsvLine() {
     String base =
         String.join(
             ",",
             queryName,
             format,
+            dfpMode,
             String.valueOf(iteration),
             String.valueOf(elapsedMs),
             String.valueOf(success),
@@ -100,7 +143,7 @@ public String toCsvLine() {
   }
 
   public static String csvHeader() {
-    return "query,format,iteration,elapsed_ms,success,error";
+    return "query,format,dfp_mode,iteration,elapsed_ms,success,error";
   }
 
   public static String csvHeaderWithMetrics() {
 
@@ -0,0 +1,105 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.spark.benchmark;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+
+/**
+ * Rewrites an existing Lance table to a new location with rows sorted by a given column inside
+ * each Spark partition.
+ *
+ * <p>The stated goal is to let downstream zonemap indexes produce tight min/max bounds: a DFP
+ * run against an unclustered fact table has nothing to prune because every fragment's zone
+ * spans the full join-key domain.
+ *
+ * <p>The rebuild is done out-of-place: the source table is read, sorted, and written to
+ * {@code <table>_clustered.lance}, which the caller then swaps in (e.g. via {@code mv} in the
+ * orchestrating shell script). Destination-in-place rewriting would require a
+ * delete-and-rewrite inside Lance and is out of scope here.
+ *
+ * <p>Usage:
+ * <pre>
+ *   spark-submit --class org.lance.spark.benchmark.DfpClusterRebuilder benchmark.jar \
+ *       --src  /path/to/tpcds/lance/store_sales.lance \
+ *       --dst  /path/to/tpcds/lance/store_sales_clustered.lance \
+ *       --sort-by ss_sold_date_sk
+ * </pre>
+ */
+public class DfpClusterRebuilder {
+
+  /**
+   * Command-line entry point.
+   *
+   * <p>Requires {@code --src}, {@code --dst} and {@code --sort-by}, each followed by a value.
+   * An unknown flag, a flag without a value, or a missing required flag prints a message and
+   * the usage line to stderr and exits with status 1. The write uses
+   * {@link SaveMode#ErrorIfExists}, so an existing destination is never overwritten.
+   *
+   * @param args command-line arguments as described above
+   */
+  public static void main(String[] args) {
+    String src = null;
+    String dst = null;
+    String sortBy = null;
+
+    for (int i = 0; i < args.length; i++) {
+      String flag = args[i];
+      switch (flag) {
+        case "--src":
+          src = requireValue(args, ++i, flag);
+          break;
+        case "--dst":
+          dst = requireValue(args, ++i, flag);
+          break;
+        case "--sort-by":
+          sortBy = requireValue(args, ++i, flag);
+          break;
+        default:
+          System.err.println("Unknown argument: " + flag);
+          printUsage();
+          System.exit(1);
+      }
+    }
+
+    if (src == null || dst == null || sortBy == null) {
+      System.err.println("Missing required arguments.");
+      printUsage();
+      System.exit(1);
+    }
+
+    SparkSession spark =
+        SparkSession.builder().appName("DFP Cluster Rebuilder: " + sortBy).getOrCreate();
+
+    try {
+      System.out.println("=== DFP Cluster Rebuilder ===");
+      System.out.println("Source:   " + src);
+      System.out.println("Dest:     " + dst);
+      System.out.println("Sort by:  " + sortBy);
+      long start = System.currentTimeMillis();
+
+      Dataset<Row> df = spark.read().format("lance").load(src);
+      // count() is a separate action over the source, run only to log the row count.
+      long rowCount = df.count();
+      System.out.println("Rows:     " + rowCount);
+
+      // sortWithinPartitions keeps fragment locality roughly intact; a full orderBy would
+      // introduce a shuffle + reduce partitioning, which we don't want for the fixture.
+      // For DFP's purposes what matters is intra-fragment clustering, which
+      // sortWithinPartitions achieves given the default Spark partitioning = one fragment per
+      // partition on read.
+      // NOTE(review): sorting inside a partition does not change the overall range of values
+      // that partition holds, so this only tightens bounds for zones smaller than a
+      // partition. Confirm the zone granularity and the one-fragment-per-partition assumption.
+      df.sortWithinPartitions(sortBy)
+          .write()
+          .mode(SaveMode.ErrorIfExists)
+          .format("lance")
+          .save(dst);
+
+      long elapsed = System.currentTimeMillis() - start;
+      System.out.printf("Wrote %s in %d ms%n", dst, elapsed);
+    } finally {
+      // Always release the session, including when the read or write throws.
+      spark.stop();
+    }
+  }
+
+  /**
+   * Returns the value that follows a flag, or exits with status 1 when the flag is the last
+   * argument and therefore has no value.
+   *
+   * @param args full argument array
+   * @param valueIndex index at which the flag's value is expected
+   * @param flag flag name, used in the error message
+   * @return {@code args[valueIndex]}
+   */
+  private static String requireValue(String[] args, int valueIndex, String flag) {
+    if (valueIndex >= args.length) {
+      System.err.println("Missing value for " + flag);
+      printUsage();
+      System.exit(1);
+    }
+    return args[valueIndex];
+  }
+
+  /** Prints the one-line usage summary to stderr. */
+  private static void printUsage() {
+    System.err.println(
+        "Usage: DfpClusterRebuilder --src <path> --dst <path> --sort-by <column>");
+  }
+}