apache
diff --git a/‎parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpCodecThroughput.java‎
Lines changed: 144 additions & 71 deletions b/‎parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpCodecThroughput.java‎
Lines changed: 144 additions & 71 deletions
@@ -18,9 +18,13 @@
  */
 package org.apache.parquet.column.values.alp.benchmark;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.ByteBuffer;
-import java.util.Random;
+import java.util.ArrayList;
+import java.util.List;
 import org.apache.parquet.bytes.ByteBufferInputStream;
 import org.apache.parquet.column.values.alp.AlpValuesReaderForDouble;
 import org.apache.parquet.column.values.alp.AlpValuesReaderForFloat;
@@ -29,120 +33,187 @@
 import org.junit.Test;
 
 /**
- * Codec-level ALP throughput benchmark reporting MB/s.
+ * Codec-level ALP throughput benchmark using real Spotify dataset columns.
  *
  * <p>Comparable to C++ encoding_alp_benchmark.cc. Measures encode and decode
- * throughput at the codec level (no Parquet pipeline overhead).
+ * throughput at the codec level (no Parquet pipeline overhead). Uses the same
+ * Spotify audio features dataset as the C++ benchmark for direct comparison.
+ *
+ * <p>The CSV source has 15K rows per column; values are tiled to 1M for stable
+ * measurement.
  */
 public class AlpCodecThroughput {
 
-  private static final int NUM_VALUES = 1_000_000;
+  private static final int TARGET_VALUES = 1_000_000;
   private static final int WARMUP = 10;
   private static final int MEASURED = 30;
 
-  // Datasets
-  private static double[] doubleDecimal;
-  private static double[] doubleInteger;
-  private static double[] doubleMixed;
-  private static float[] floatDecimal;
-  private static float[] floatInteger;
-  private static float[] floatMixed;
-
-  // Pre-compressed
-  private static byte[] doubleDecimalComp;
-  private static byte[] doubleIntegerComp;
-  private static byte[] doubleMixedComp;
-  private static byte[] floatDecimalComp;
-  private static byte[] floatIntegerComp;
-  private static byte[] floatMixedComp;
+  private static final String DOUBLE_CSV = "alp_spotify1_expect.csv"; // classpath resource parsed by loadDoubleCsv
+  private static final String FLOAT_CSV = "alp_float_spotify1_expect.csv"; // classpath resource parsed by loadFloatCsv
+
+  // Spotify column names matching C++ benchmark; used positionally as row labels for the loaded CSV columns
+  private static final String[] COLUMNS = {
+    "valence", "danceability", "energy", "loudness", "speechiness",
+    "acousticness", "instrumentalness", "liveness", "tempo"
+  };
+
+  private static double[][] doubleColumns; // [column][value]: each CSV column tiled to TARGET_VALUES entries
+  private static float[][] floatColumns; // [column][value]: each CSV column tiled to TARGET_VALUES entries
+  private static byte[][] doubleCompressed; // [column]: result of compressDoubles(doubleColumns[column])
+  private static byte[][] floatCompressed; // [column]: result of compressFloats(floatColumns[column])
 
   @BeforeClass
   public static void setup() throws IOException {
-    Random rng = new Random(42);
-
-    doubleDecimal = new double[NUM_VALUES];
-    for (int i = 0; i < NUM_VALUES; i++) {
-      doubleDecimal[i] = Math.round(rng.nextDouble() * 10000) / 100.0;
+    // Load the double columns from the Spotify CSV once, before any measurement runs
+    double[][] rawDoubles = loadDoubleCsv(DOUBLE_CSV);
+    doubleColumns = new double[rawDoubles.length][];
+    doubleCompressed = new byte[rawDoubles.length][];
+    for (int c = 0; c < rawDoubles.length; c++) {
+      doubleColumns[c] = tile(rawDoubles[c], TARGET_VALUES); // repeat the CSV rows up to TARGET_VALUES
+      doubleCompressed[c] = compressDoubles(doubleColumns[c]); // pre-encode so the decode benchmark has input
     }
 
-    doubleInteger = new double[NUM_VALUES];
-    for (int i = 0; i < NUM_VALUES; i++) {
-      doubleInteger[i] = (double) rng.nextInt(100000);
+    // Load the float columns from the Spotify CSV, mirroring the double path above
+    float[][] rawFloats = loadFloatCsv(FLOAT_CSV);
+    floatColumns = new float[rawFloats.length][];
+    floatCompressed = new byte[rawFloats.length][];
+    for (int c = 0; c < rawFloats.length; c++) {
+      floatColumns[c] = tile(rawFloats[c], TARGET_VALUES); // repeat the CSV rows up to TARGET_VALUES
+      floatCompressed[c] = compressFloats(floatColumns[c]); // pre-encode so the decode benchmark has input
     }
+  }
 
-    doubleMixed = new double[NUM_VALUES];
-    for (int i = 0; i < NUM_VALUES; i++) {
-      doubleMixed[i] = Math.round(rng.nextDouble() * 10000) / 100.0;
-    }
-    for (int i = 0; i < NUM_VALUES; i += 50) {
-      doubleMixed[i] = Double.NaN;
-    }
+  /**
+   * Benchmarks every loaded double and float column and prints a throughput table for each type.
+   *
+   * <p>The per-column measurement is delegated to {@code benchDouble} / {@code benchFloat}. Row
+   * labels come from {@code COLUMNS} by position; when the CSV supplied more columns than there
+   * are names, a positional {@code column_N} label is used instead of failing.
+   *
+   * @throws IOException if encoding or decoding a column fails
+   */
+  @Test
+  public void measureThroughput() throws IOException {
+    System.out.println();
+    System.out.printf("=== ALP Codec-Level Throughput (%dK values, Spotify dataset) ===%n",
+        TARGET_VALUES / 1000);
+    System.out.println();
 
-    floatDecimal = new float[NUM_VALUES];
-    for (int i = 0; i < NUM_VALUES; i++) {
-      floatDecimal[i] = Math.round(rng.nextFloat() * 10000) / 100.0f;
+    // Double columns
+    System.out.printf("%-30s %10s %10s %10s %10s%n",
+        "Double Column", "Enc MB/s", "Dec MB/s", "Raw KB", "Comp KB");
+    System.out.println("------------------------------"
+        + " ---------- ---------- ---------- ----------");
+    for (int c = 0; c < doubleColumns.length; c++) {
+      // The column count comes from the CSV header, not from COLUMNS, so guard the lookup
+      // rather than aborting the run with ArrayIndexOutOfBoundsException.
+      String label = c < COLUMNS.length ? COLUMNS[c] : "column_" + c;
+      benchDouble(label, doubleColumns[c], doubleCompressed[c]);
     }
 
-    floatInteger = new float[NUM_VALUES];
-    for (int i = 0; i < NUM_VALUES; i++) {
-      floatInteger[i] = (float) rng.nextInt(100000);
-    }
+    System.out.println();
 
-    floatMixed = new float[NUM_VALUES];
-    for (int i = 0; i < NUM_VALUES; i++) {
-      floatMixed[i] = Math.round(rng.nextFloat() * 10000) / 100.0f;
+    // Float columns
+    System.out.printf("%-30s %10s %10s %10s %10s%n",
+        "Float Column", "Enc MB/s", "Dec MB/s", "Raw KB", "Comp KB");
+    System.out.println("------------------------------"
+        + " ---------- ---------- ---------- ----------");
+    for (int c = 0; c < floatColumns.length; c++) {
+      // Same guard as for the double columns: the float CSV may be wider than COLUMNS.
+      String label = c < COLUMNS.length ? COLUMNS[c] : "column_" + c;
+      benchFloat(label, floatColumns[c], floatCompressed[c]);
     }
-    for (int i = 0; i < NUM_VALUES; i += 50) {
-      floatMixed[i] = Float.NaN;
+
+    System.out.println();
+  }
+
+  // ========== CSV loading ==========
+
+  /**
+   * Loads a comma-separated classpath resource of doubles and returns it column-major.
+   *
+   * <p>The first line is treated as a header and is used only to determine the column count.
+   * Blank lines are skipped; every other line must hold at least that many numeric fields.
+   *
+   * @param resource name of the CSV resource, resolved through this class's class loader
+   * @return {@code columns[c][r]}: the value of column {@code c} in data row {@code r}
+   * @throws IOException if the resource is missing or empty, or a data row is short or malformed
+   */
+  private static double[][] loadDoubleCsv(String resource) throws IOException {
+    try (InputStream is = AlpCodecThroughput.class.getClassLoader().getResourceAsStream(resource)) {
+      if (is == null) {
+        throw new IOException("Resource not found: " + resource);
+      }
+      // Decode explicitly as UTF-8 so parsing does not depend on the platform default charset.
+      // The reader is not closed separately: it wraps only `is`, which try-with-resources closes.
+      BufferedReader br =
+          new BufferedReader(new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8));
+      String header = br.readLine();
+      if (header == null) {
+        throw new IOException("Empty CSV resource: " + resource);
+      }
+      int numCols = header.split(",").length;
+
+      List<double[]> rows = new ArrayList<>();
+      String line;
+      int lineNo = 1; // the header was line 1
+      while ((line = br.readLine()) != null) {
+        lineNo++;
+        if (line.trim().isEmpty()) {
+          continue; // tolerate blank lines instead of failing on a missing field
+        }
+        String[] parts = line.split(",");
+        if (parts.length < numCols) {
+          throw new IOException(resource + ":" + lineNo + ": expected " + numCols
+              + " fields but found " + parts.length);
+        }
+        double[] row = new double[numCols];
+        try {
+          for (int i = 0; i < numCols; i++) {
+            row[i] = Double.parseDouble(parts[i]);
+          }
+        } catch (NumberFormatException e) {
+          // Keep the cause, but report which resource and line carried the bad value.
+          throw new IOException(resource + ":" + lineNo + ": malformed number", e);
+        }
+        rows.add(row);
+      }
+
+      // Transpose: rows -> columns
+      double[][] columns = new double[numCols][rows.size()];
+      for (int r = 0; r < rows.size(); r++) {
+        double[] row = rows.get(r);
+        for (int c = 0; c < numCols; c++) {
+          columns[c][r] = row[c];
+        }
+      }
+      return columns;
     }
+  }
 
-    doubleDecimalComp = compressDoubles(doubleDecimal);
-    doubleIntegerComp = compressDoubles(doubleInteger);
-    doubleMixedComp = compressDoubles(doubleMixed);
-    floatDecimalComp = compressFloats(floatDecimal);
-    floatIntegerComp = compressFloats(floatInteger);
-    floatMixedComp = compressFloats(floatMixed);
+  /**
+   * Loads a comma-separated classpath resource of floats and returns it column-major.
+   *
+   * <p>The first line is treated as a header and is used only to determine the column count.
+   * Blank lines are skipped; every other line must hold at least that many numeric fields.
+   *
+   * @param resource name of the CSV resource, resolved through this class's class loader
+   * @return {@code columns[c][r]}: the value of column {@code c} in data row {@code r}
+   * @throws IOException if the resource is missing or empty, or a data row is short or malformed
+   */
+  private static float[][] loadFloatCsv(String resource) throws IOException {
+    try (InputStream is = AlpCodecThroughput.class.getClassLoader().getResourceAsStream(resource)) {
+      if (is == null) {
+        throw new IOException("Resource not found: " + resource);
+      }
+      // Decode explicitly as UTF-8 so parsing does not depend on the platform default charset.
+      // The reader is not closed separately: it wraps only `is`, which try-with-resources closes.
+      BufferedReader br =
+          new BufferedReader(new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8));
+      String header = br.readLine();
+      if (header == null) {
+        throw new IOException("Empty CSV resource: " + resource);
+      }
+      int numCols = header.split(",").length;
+
+      List<float[]> rows = new ArrayList<>();
+      String line;
+      int lineNo = 1; // the header was line 1
+      while ((line = br.readLine()) != null) {
+        lineNo++;
+        if (line.trim().isEmpty()) {
+          continue; // tolerate blank lines instead of failing on a missing field
+        }
+        String[] parts = line.split(",");
+        if (parts.length < numCols) {
+          throw new IOException(resource + ":" + lineNo + ": expected " + numCols
+              + " fields but found " + parts.length);
+        }
+        float[] row = new float[numCols];
+        try {
+          for (int i = 0; i < numCols; i++) {
+            row[i] = Float.parseFloat(parts[i]);
+          }
+        } catch (NumberFormatException e) {
+          // Keep the cause, but report which resource and line carried the bad value.
+          throw new IOException(resource + ":" + lineNo + ": malformed number", e);
+        }
+        rows.add(row);
+      }
+
+      // Transpose: rows -> columns
+      float[][] columns = new float[numCols][rows.size()];
+      for (int r = 0; r < rows.size(); r++) {
+        float[] row = rows.get(r);
+        for (int c = 0; c < numCols; c++) {
+          columns[c][r] = row[c];
+        }
+      }
+      return columns;
+    }
   }
 
-  @Test
-  public void measureThroughput() throws IOException {
-    System.out.println();
-    System.out.println("=== ALP Codec-Level Throughput (1M values) ===");
-    System.out.printf("%-30s %10s %10s %10s %10s%n",
-        "Dataset", "Enc MB/s", "Dec MB/s", "Raw KB", "Comp KB");
-    System.out.println("------------------------------" +
-        " ---------- ---------- ---------- ----------");
+  // ========== Tiling ==========
 
-    benchDouble("double_decimal", doubleDecimal, doubleDecimalComp);
-    benchDouble("double_integer", doubleInteger, doubleIntegerComp);
-    benchDouble("double_mixed(2%exc)", doubleMixed, doubleMixedComp);
-    benchFloat("float_decimal", floatDecimal, floatDecimalComp);
-    benchFloat("float_integer", floatInteger, floatIntegerComp);
-    benchFloat("float_mixed(2%exc)", floatMixed, floatMixedComp);
+  /**
+   * Builds an array of {@code targetSize} values by repeating {@code source} cyclically.
+   *
+   * <p>Element {@code i} of the result equals {@code source[i % source.length]}; the last
+   * repetition is truncated when {@code targetSize} is not a multiple of the source length.
+   *
+   * @param source values to repeat; must be non-empty when {@code targetSize} is positive
+   * @param targetSize length of the returned array
+   * @return a new array of length {@code targetSize}
+   * @throws IllegalArgumentException if {@code source} is empty and {@code targetSize} is positive
+   */
+  private static double[] tile(double[] source, int targetSize) {
+    // Fail with a clear message: an empty source cannot fill a non-empty result.
+    if (targetSize > 0 && source.length == 0) {
+      throw new IllegalArgumentException(
+          "Cannot tile an empty source to " + targetSize + " values");
+    }
+    double[] result = new double[targetSize];
+    int filled = 0;
+    while (filled < targetSize) {
+      // Bulk-copy one full repetition, or the truncated tail on the last pass.
+      int chunk = Math.min(source.length, targetSize - filled);
+      System.arraycopy(source, 0, result, filled, chunk);
+      filled += chunk;
+    }
+    return result;
+  }
 
-    System.out.println();
+  /**
+   * Builds an array of {@code targetSize} values by repeating {@code source} cyclically.
+   *
+   * <p>Element {@code i} of the result equals {@code source[i % source.length]}; the last
+   * repetition is truncated when {@code targetSize} is not a multiple of the source length.
+   *
+   * @param source values to repeat; must be non-empty when {@code targetSize} is positive
+   * @param targetSize length of the returned array
+   * @return a new array of length {@code targetSize}
+   * @throws IllegalArgumentException if {@code source} is empty and {@code targetSize} is positive
+   */
+  private static float[] tile(float[] source, int targetSize) {
+    // Fail with a clear message: an empty source cannot fill a non-empty result.
+    if (targetSize > 0 && source.length == 0) {
+      throw new IllegalArgumentException(
+          "Cannot tile an empty source to " + targetSize + " values");
+    }
+    float[] result = new float[targetSize];
+    int filled = 0;
+    while (filled < targetSize) {
+      // Bulk-copy one full repetition, or the truncated tail on the last pass.
+      int chunk = Math.min(source.length, targetSize - filled);
+      System.arraycopy(source, 0, result, filled, chunk);
+      filled += chunk;
+    }
+    return result;
   }
 
+  // ========== Benchmark methods ==========
+
   private void benchDouble(String name, double[] data, byte[] compressed) throws IOException {
     long rawBytes = (long) data.length * Double.BYTES;
 
-    // Warmup encode
     for (int i = 0; i < WARMUP; i++) {
       compressDoubles(data);
     }
-    // Measure encode
     long encNanos = 0;
     for (int i = 0; i < MEASURED; i++) {
       long t0 = System.nanoTime();
       compressDoubles(data);
       encNanos += System.nanoTime() - t0;
     }
 
-    // Warmup decode
     for (int i = 0; i < WARMUP; i++) {
       decompressDoubles(compressed, data.length);
     }
-    // Measure decode
     long decNanos = 0;
     for (int i = 0; i < MEASURED; i++) {
       long t0 = System.nanoTime();
@@ -187,6 +258,8 @@ private void benchFloat(String name, float[] data, byte[] compressed) throws IOE
         name, encMBps, decMBps, rawBytes / 1024, compressed.length / 1024);
   }
 
+  // ========== Compress / Decompress ==========
+
   private static byte[] compressDoubles(double[] values) throws IOException {
     AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter();
     for (double v : values) {