Skip to content

Commit c4c90eb

Browse files
committed
ALP: Gzip compress CSV test data and regenerate float parquet files
Compress the 4 expect CSV files with gzip (5.3 MB -> 1.4 MB) and update all readers (TestInteropAlpEncoding, GenerateAlpParquet, AlpCodecThroughput) to decompress via GZIPInputStream. Also regenerate the C++ and Java float parquet test files which had stale/invalid page headers, and remove the Hadoop CRC files that were causing checksum errors.
1 parent b93c3b1 commit c4c90eb

15 files changed

+30
-60027
lines changed

parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpCodecThroughput.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.io.IOException;
2424
import java.io.InputStream;
2525
import java.io.InputStreamReader;
26+
import java.util.zip.GZIPInputStream;
2627
import java.nio.ByteBuffer;
2728
import java.nio.file.Files;
2829
import java.nio.file.Path;
@@ -53,8 +54,8 @@ public class AlpCodecThroughput {
5354
private static final int MEASURED = 30;
5455

5556
private static final String CSV_DIR = "parquet-hadoop/src/test/resources";
56-
private static final String DOUBLE_CSV = "alp_spotify1_expect.csv";
57-
private static final String FLOAT_CSV = "alp_float_spotify1_expect.csv";
57+
private static final String DOUBLE_CSV = "alp_spotify1_expect.csv.gz";
58+
private static final String FLOAT_CSV = "alp_float_spotify1_expect.csv.gz";
5859

5960
// Spotify column names matching C++ benchmark
6061
private static final String[] COLUMNS = {
@@ -140,7 +141,7 @@ private static Path findCsvDir() throws IOException {
140141
}
141142

142143
private static double[][] loadDoubleCsv(Path csvPath) throws IOException {
143-
try (InputStream is = new FileInputStream(csvPath.toFile())) {
144+
try (InputStream is = new GZIPInputStream(new FileInputStream(csvPath.toFile()))) {
144145
BufferedReader br = new BufferedReader(new InputStreamReader(is));
145146
String header = br.readLine();
146147
int numCols = header.split(",").length;
@@ -168,7 +169,7 @@ private static double[][] loadDoubleCsv(Path csvPath) throws IOException {
168169
}
169170

170171
private static float[][] loadFloatCsv(Path csvPath) throws IOException {
171-
try (InputStream is = new FileInputStream(csvPath.toFile())) {
172+
try (InputStream is = new GZIPInputStream(new FileInputStream(csvPath.toFile()))) {
172173
BufferedReader br = new BufferedReader(new InputStreamReader(is));
173174
String header = br.readLine();
174175
int numCols = header.split(",").length;

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/GenerateAlpParquet.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.io.InputStream;
2424
import java.io.InputStreamReader;
2525
import java.nio.charset.StandardCharsets;
26+
import java.util.zip.GZIPInputStream;
2627
import java.nio.file.Files;
2728
import java.nio.file.Paths;
2829
import java.util.ArrayList;
@@ -38,7 +39,7 @@
3839
/**
3940
* Standalone utility to generate ALP-encoded parquet files from CSV test data.
4041
*
41-
* <p>Reads the existing expect CSV files (alp_spotify1_expect.csv, alp_arade_expect.csv)
42+
* <p>Reads the existing expect CSV files (alp_spotify1_expect.csv.gz, alp_arade_expect.csv.gz)
4243
* from test resources and writes ALP-encoded parquet files using the Java ALP encoder.
4344
*
4445
* <p>Usage: java GenerateAlpParquet [output_directory]
@@ -50,18 +51,18 @@ public static void main(String[] args) throws IOException {
5051
String outputDir = args.length > 0 ? args[0] : ".";
5152
Files.createDirectories(Paths.get(outputDir));
5253

53-
generateAlpParquet("/alp_arade_expect.csv", outputDir + "/alp_java_arade.parquet");
54+
generateAlpParquet("/alp_arade_expect.csv.gz", outputDir + "/alp_java_arade.parquet");
5455
System.out.println("Generated: " + outputDir + "/alp_java_arade.parquet");
5556

56-
generateAlpParquet("/alp_spotify1_expect.csv", outputDir + "/alp_java_spotify1.parquet");
57+
generateAlpParquet("/alp_spotify1_expect.csv.gz", outputDir + "/alp_java_spotify1.parquet");
5758
System.out.println("Generated: " + outputDir + "/alp_java_spotify1.parquet");
5859

5960
generateAlpParquetFloat(
60-
"/alp_float_arade_expect.csv", outputDir + "/alp_java_float_arade.parquet");
61+
"/alp_float_arade_expect.csv.gz", outputDir + "/alp_java_float_arade.parquet");
6162
System.out.println("Generated: " + outputDir + "/alp_java_float_arade.parquet");
6263

6364
generateAlpParquetFloat(
64-
"/alp_float_spotify1_expect.csv", outputDir + "/alp_java_float_spotify1.parquet");
65+
"/alp_float_spotify1_expect.csv.gz", outputDir + "/alp_java_float_spotify1.parquet");
6566
System.out.println("Generated: " + outputDir + "/alp_java_float_spotify1.parquet");
6667
}
6768

@@ -70,7 +71,8 @@ private static void generateAlpParquet(String csvResource, String outputPath) th
7071
String[] columnNames;
7172
List<double[]> rows = new ArrayList<>();
7273

73-
try (InputStream is = GenerateAlpParquet.class.getResourceAsStream(csvResource);
74+
try (InputStream raw = GenerateAlpParquet.class.getResourceAsStream(csvResource);
75+
InputStream is = new GZIPInputStream(raw);
7476
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
7577
// Parse header
7678
String header = br.readLine();
@@ -126,7 +128,8 @@ private static void generateAlpParquetFloat(String csvResource, String outputPat
126128
String[] columnNames;
127129
List<float[]> rows = new ArrayList<>();
128130

129-
try (InputStream is = GenerateAlpParquet.class.getResourceAsStream(csvResource);
131+
try (InputStream raw = GenerateAlpParquet.class.getResourceAsStream(csvResource);
132+
InputStream is = new GZIPInputStream(raw);
130133
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
131134
// Parse header
132135
String header = br.readLine();

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInteropAlpEncoding.java

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import java.io.IOException;
2626
import java.io.InputStream;
2727
import java.io.InputStreamReader;
28+
import java.util.zip.GZIPInputStream;
2829
import java.net.URISyntaxException;
2930
import java.nio.charset.StandardCharsets;
3031
import java.util.ArrayList;
@@ -66,7 +67,7 @@ public void testReadAlpAradeParquet() throws IOException {
6667
int expectedRows = 15000;
6768

6869
// Read expected values from CSV
69-
double[][] expected = readExpectedCsv("/alp_arade_expect.csv", columnNames.length, expectedRows);
70+
double[][] expected = readExpectedCsv("/alp_arade_expect.csv.gz", columnNames.length, expectedRows);
7071

7172
// Read parquet file using GroupReadSupport
7273
List<Group> rows = readParquetGroups(parquetPath);
@@ -109,7 +110,7 @@ public void testReadAlpSpotify1Parquet() throws IOException {
109110
int expectedRows = 15000;
110111

111112
// Read expected values from CSV
112-
double[][] expected = readExpectedCsv("/alp_spotify1_expect.csv", columnNames.length, expectedRows);
113+
double[][] expected = readExpectedCsv("/alp_spotify1_expect.csv.gz", columnNames.length, expectedRows);
113114

114115
// Read parquet file using GroupReadSupport
115116
List<Group> rows = readParquetGroups(parquetPath);
@@ -141,7 +142,7 @@ public void testReadAlpJavaAradeParquet() throws IOException {
141142
String[] columnNames = {"value1", "value2", "value3", "value4"};
142143
int expectedRows = 15000;
143144

144-
double[][] expected = readExpectedCsv("/alp_arade_expect.csv", columnNames.length, expectedRows);
145+
double[][] expected = readExpectedCsv("/alp_arade_expect.csv.gz", columnNames.length, expectedRows);
145146

146147
List<Group> rows = readParquetGroups(parquetPath);
147148
assertEquals("Row count should match", expectedRows, rows.size());
@@ -180,7 +181,7 @@ public void testReadAlpJavaSpotify1Parquet() throws IOException {
180181
};
181182
int expectedRows = 15000;
182183

183-
double[][] expected = readExpectedCsv("/alp_spotify1_expect.csv", columnNames.length, expectedRows);
184+
double[][] expected = readExpectedCsv("/alp_spotify1_expect.csv.gz", columnNames.length, expectedRows);
184185

185186
List<Group> rows = readParquetGroups(parquetPath);
186187
assertEquals("Row count should match", expectedRows, rows.size());
@@ -209,7 +210,7 @@ public void testReadAlpFloatAradeParquet() throws IOException {
209210
String[] columnNames = {"value1", "value2", "value3", "value4"};
210211
int expectedRows = 15000;
211212

212-
float[][] expected = readExpectedCsvFloat("/alp_float_arade_expect.csv", columnNames.length, expectedRows);
213+
float[][] expected = readExpectedCsvFloat("/alp_float_arade_expect.csv.gz", columnNames.length, expectedRows);
213214

214215
List<Group> rows = readParquetGroups(parquetPath);
215216
assertEquals("Row count should match", expectedRows, rows.size());
@@ -248,7 +249,7 @@ public void testReadAlpFloatSpotify1Parquet() throws IOException {
248249
};
249250
int expectedRows = 15000;
250251

251-
float[][] expected = readExpectedCsvFloat("/alp_float_spotify1_expect.csv", columnNames.length, expectedRows);
252+
float[][] expected = readExpectedCsvFloat("/alp_float_spotify1_expect.csv.gz", columnNames.length, expectedRows);
252253

253254
List<Group> rows = readParquetGroups(parquetPath);
254255
assertEquals("Row count should match", expectedRows, rows.size());
@@ -277,7 +278,7 @@ public void testReadAlpJavaFloatAradeParquet() throws IOException {
277278
String[] columnNames = {"value1", "value2", "value3", "value4"};
278279
int expectedRows = 15000;
279280

280-
float[][] expected = readExpectedCsvFloat("/alp_float_arade_expect.csv", columnNames.length, expectedRows);
281+
float[][] expected = readExpectedCsvFloat("/alp_float_arade_expect.csv.gz", columnNames.length, expectedRows);
281282

282283
List<Group> rows = readParquetGroups(parquetPath);
283284
assertEquals("Row count should match", expectedRows, rows.size());
@@ -316,7 +317,7 @@ public void testReadAlpJavaFloatSpotify1Parquet() throws IOException {
316317
};
317318
int expectedRows = 15000;
318319

319-
float[][] expected = readExpectedCsvFloat("/alp_float_spotify1_expect.csv", columnNames.length, expectedRows);
320+
float[][] expected = readExpectedCsvFloat("/alp_float_spotify1_expect.csv.gz", columnNames.length, expectedRows);
320321

321322
List<Group> rows = readParquetGroups(parquetPath);
322323
assertEquals("Row count should match", expectedRows, rows.size());
@@ -368,9 +369,10 @@ private void verifyAlpEncoding(Path path) throws IOException {
368369
*/
369370
private double[][] readExpectedCsv(String resourcePath, int numColumns, int expectedRows) throws IOException {
370371
double[][] columns = new double[numColumns][expectedRows];
371-
try (InputStream is = getClass().getResourceAsStream(resourcePath);
372+
try (InputStream raw = getClass().getResourceAsStream(resourcePath);
373+
InputStream is = new GZIPInputStream(raw);
372374
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
373-
assertNotNull("CSV resource not found: " + resourcePath, is);
375+
assertNotNull("CSV resource not found: " + resourcePath, raw);
374376

375377
// Skip header
376378
String header = br.readLine();
@@ -397,9 +399,10 @@ private double[][] readExpectedCsv(String resourcePath, int numColumns, int expe
397399
*/
398400
private float[][] readExpectedCsvFloat(String resourcePath, int numColumns, int expectedRows) throws IOException {
399401
float[][] columns = new float[numColumns][expectedRows];
400-
try (InputStream is = getClass().getResourceAsStream(resourcePath);
402+
try (InputStream raw = getClass().getResourceAsStream(resourcePath);
403+
InputStream is = new GZIPInputStream(raw);
401404
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
402-
assertNotNull("CSV resource not found: " + resourcePath, is);
405+
assertNotNull("CSV resource not found: " + resourcePath, raw);
403406

404407
// Skip header
405408
String header = br.readLine();
-1.39 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
-2.38 KB
Binary file not shown.

0 commit comments

Comments (0)