Skip to content

Commit a598134

Browse files
committed
ALP: Remove duplicate CSV test data from parquet-column
Load Spotify CSV files from parquet-hadoop/src/test/resources/ instead of duplicating them into parquet-column/src/test/resources/. The benchmark now resolves the CSV directory relative to the project root.
1 parent 9c1526b commit a598134

File tree

3 files changed

+31
-30016
lines changed

3 files changed

+31
-30016
lines changed

parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpCodecThroughput.java

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@
1919
package org.apache.parquet.column.values.alp.benchmark;
2020

2121
import java.io.BufferedReader;
22+
import java.io.FileInputStream;
2223
import java.io.IOException;
2324
import java.io.InputStream;
2425
import java.io.InputStreamReader;
2526
import java.nio.ByteBuffer;
27+
import java.nio.file.Files;
28+
import java.nio.file.Path;
29+
import java.nio.file.Paths;
2630
import java.util.ArrayList;
2731
import java.util.List;
2832
import org.apache.parquet.bytes.ByteBufferInputStream;
@@ -48,6 +52,7 @@ public class AlpCodecThroughput {
4852
private static final int WARMUP = 10;
4953
private static final int MEASURED = 30;
5054

55+
private static final String CSV_DIR = "parquet-hadoop/src/test/resources";
5156
private static final String DOUBLE_CSV = "alp_spotify1_expect.csv";
5257
private static final String FLOAT_CSV = "alp_float_spotify1_expect.csv";
5358

@@ -64,17 +69,17 @@ public class AlpCodecThroughput {
6469

6570
@BeforeClass
6671
public static void setup() throws IOException {
67-
// Load double columns from Spotify CSV
68-
double[][] rawDoubles = loadDoubleCsv(DOUBLE_CSV);
72+
Path csvDir = findCsvDir();
73+
74+
double[][] rawDoubles = loadDoubleCsv(csvDir.resolve(DOUBLE_CSV));
6975
doubleColumns = new double[rawDoubles.length][];
7076
doubleCompressed = new byte[rawDoubles.length][];
7177
for (int c = 0; c < rawDoubles.length; c++) {
7278
doubleColumns[c] = tile(rawDoubles[c], TARGET_VALUES);
7379
doubleCompressed[c] = compressDoubles(doubleColumns[c]);
7480
}
7581

76-
// Load float columns from Spotify CSV
77-
float[][] rawFloats = loadFloatCsv(FLOAT_CSV);
82+
float[][] rawFloats = loadFloatCsv(csvDir.resolve(FLOAT_CSV));
7883
floatColumns = new float[rawFloats.length][];
7984
floatCompressed = new byte[rawFloats.length][];
8085
for (int c = 0; c < rawFloats.length; c++) {
@@ -115,11 +120,27 @@ public void measureThroughput() throws IOException {
115120

116121
// ========== CSV loading ==========
117122

118-
private static double[][] loadDoubleCsv(String resource) throws IOException {
119-
try (InputStream is = AlpCodecThroughput.class.getClassLoader().getResourceAsStream(resource)) {
120-
if (is == null) {
121-
throw new IOException("Resource not found: " + resource);
123+
/**
124+
* Find the CSV directory. Searches from the working directory upward for the
125+
* parquet-hadoop test resources directory, so the benchmark works whether run
126+
* from the project root or from parquet-column/.
127+
*/
128+
private static Path findCsvDir() throws IOException {
129+
Path dir = Paths.get("").toAbsolutePath();
130+
for (int i = 0; i < 3; i++) {
131+
Path candidate = dir.resolve(CSV_DIR);
132+
if (Files.isDirectory(candidate) && Files.exists(candidate.resolve(DOUBLE_CSV))) {
133+
return candidate;
122134
}
135+
dir = dir.getParent();
136+
if (dir == null) break;
137+
}
138+
throw new IOException("Cannot find CSV directory '" + CSV_DIR
139+
+ "'. Run from the parquet-java project root.");
140+
}
141+
142+
private static double[][] loadDoubleCsv(Path csvPath) throws IOException {
143+
try (InputStream is = new FileInputStream(csvPath.toFile())) {
123144
BufferedReader br = new BufferedReader(new InputStreamReader(is));
124145
String header = br.readLine();
125146
int numCols = header.split(",").length;
@@ -135,7 +156,6 @@ private static double[][] loadDoubleCsv(String resource) throws IOException {
135156
rows.add(row);
136157
}
137158

138-
// Transpose: rows -> columns
139159
double[][] columns = new double[numCols][rows.size()];
140160
for (int r = 0; r < rows.size(); r++) {
141161
double[] row = rows.get(r);
@@ -147,11 +167,8 @@ private static double[][] loadDoubleCsv(String resource) throws IOException {
147167
}
148168
}
149169

150-
private static float[][] loadFloatCsv(String resource) throws IOException {
151-
try (InputStream is = AlpCodecThroughput.class.getClassLoader().getResourceAsStream(resource)) {
152-
if (is == null) {
153-
throw new IOException("Resource not found: " + resource);
154-
}
170+
private static float[][] loadFloatCsv(Path csvPath) throws IOException {
171+
try (InputStream is = new FileInputStream(csvPath.toFile())) {
155172
BufferedReader br = new BufferedReader(new InputStreamReader(is));
156173
String header = br.readLine();
157174
int numCols = header.split(",").length;

0 commit comments

Comments (0)