Skip to content

Commit 7190ab6

Browse files
authored
GH-3372: Enhance ColumnSizeCommand to sort by size (#3371)
- Support sorting columns by size in descending order. - Support printing the ratio as a percentage.
1 parent c7aeaaa commit 7190ab6

File tree

1 file changed

+67
-5
lines changed

1 file changed

+67
-5
lines changed

parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java

Lines changed: 67 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,11 @@
2424
import com.google.common.base.Preconditions;
2525
import com.google.common.collect.Lists;
2626
import java.io.IOException;
27+
import java.util.ArrayList;
2728
import java.util.HashMap;
29+
import java.util.LinkedHashMap;
2830
import java.util.List;
31+
import java.util.Locale;
2932
import java.util.Map;
3033
import org.apache.hadoop.conf.Configuration;
3134
import org.apache.hadoop.fs.Path;
@@ -56,6 +59,18 @@ public ColumnSizeCommand(Logger console) {
5659
required = false)
5760
List<String> columns;
5861

62+
@Parameter(
63+
names = {"-s", "--sort"},
64+
description = "Sort columns by size in descending order",
65+
required = false)
66+
boolean sortBySize = false;
67+
68+
@Parameter(
69+
names = {"-p", "--percentage"},
70+
description = "Print ratio as percentage instead of decimal",
71+
required = false)
72+
boolean printAsPercentage = false;
73+
5974
@Override
6075
@SuppressWarnings("unchecked")
6176
public int run() throws IOException {
@@ -67,6 +82,10 @@ public int run() throws IOException {
6782

6883
// If user defined columns, only print out size for those columns
6984
if (columns != null && !columns.isEmpty()) {
85+
// Collect aggregated column data
86+
Map<String, Long> aggregatedSizes = new LinkedHashMap<>();
87+
Map<String, Float> aggregatedRatios = new LinkedHashMap<>();
88+
7089
for (String inputColumn : columns) {
7190
long size = 0;
7291
float ratio = 0;
@@ -76,18 +95,52 @@ public int run() throws IOException {
7695
ratio += columnRatio.get(column);
7796
}
7897
}
79-
console.info(inputColumn + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratio);
98+
aggregatedSizes.put(inputColumn, size);
99+
aggregatedRatios.put(inputColumn, ratio);
100+
}
101+
102+
// Sort if requested
103+
List<Map.Entry<String, Long>> entries = new ArrayList<>(aggregatedSizes.entrySet());
104+
if (sortBySize) {
105+
entries.sort(Map.Entry.<String, Long>comparingByValue().reversed());
106+
}
107+
108+
// Print results
109+
for (Map.Entry<String, Long> entry : entries) {
110+
String column = entry.getKey();
111+
long size = entry.getValue();
112+
float ratio = aggregatedRatios.get(column);
113+
String ratioStr = formatRatio(ratio);
114+
console.info(column + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratioStr);
80115
}
81116
} else {
82-
for (String column : columnSizes.keySet()) {
83-
console.info(column + "->" + " Size In Bytes: " + columnSizes.get(column) + " Size In Ratio: "
84-
+ columnRatio.get(column));
117+
// Sort if requested
118+
List<Map.Entry<String, Long>> entries = new ArrayList<>(columnSizes.entrySet());
119+
if (sortBySize) {
120+
entries.sort(Map.Entry.<String, Long>comparingByValue().reversed());
121+
}
122+
123+
// Print results
124+
for (Map.Entry<String, Long> entry : entries) {
125+
String column = entry.getKey();
126+
long size = entry.getValue();
127+
float ratio = columnRatio.get(column);
128+
String ratioStr = formatRatio(ratio);
129+
console.info(column + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratioStr);
85130
}
86131
}
87132

88133
return 0;
89134
}
90135

136+
private String formatRatio(float ratio) {
137+
if (printAsPercentage) {
138+
return String.format(Locale.US, "%.4f%%", ratio * 100);
139+
} else {
140+
return String.valueOf(ratio);
141+
}
142+
}
143+
91144
@Override
92145
public List<String> getExamples() {
93146
return Lists.newArrayList(
@@ -96,7 +149,16 @@ public List<String> getExamples() {
96149
"sample.parquet -c col_1",
97150
"sample.parquet --column col_2",
98151
"sample.parquet --columns col_1 col_2",
99-
"sample.parquet --columns col_1 col_2.sub_col_a");
152+
"sample.parquet --columns col_1 col_2.sub_col_a",
153+
"# Sort columns by size in descending order",
154+
"sample.parquet --sort",
155+
"sample.parquet -s",
156+
"# Print ratio as percentage",
157+
"sample.parquet --percentage",
158+
"sample.parquet -p",
159+
"# Combine sorting and percentage formatting",
160+
"sample.parquet --sort --percentage",
161+
"sample.parquet -s -p -c col_1 col_2");
100162
}
101163

102164
// Make it public to allow some automation tools to call it

0 commit comments

Comments
 (0)