2424import com .google .common .base .Preconditions ;
2525import com .google .common .collect .Lists ;
2626import java .io .IOException ;
27+ import java .util .ArrayList ;
2728import java .util .HashMap ;
29+ import java .util .LinkedHashMap ;
2830import java .util .List ;
31+ import java .util .Locale ;
2932import java .util .Map ;
3033import org .apache .hadoop .conf .Configuration ;
3134import org .apache .hadoop .fs .Path ;
@@ -56,6 +59,18 @@ public ColumnSizeCommand(Logger console) {
5659 required = false )
5760 List <String > columns ;
5861
/**
 * Command-line flag ({@code -s} / {@code --sort}). When set, {@code run()} sorts the
 * collected column entries by their byte size, largest first, before printing them.
 * When unset, entries are printed in the iteration order of the map they came from.
 */
62+ @ Parameter (
63+ names = {"-s" , "--sort" },
64+ description = "Sort columns by size in descending order" ,
65+ required = false )
66+ boolean sortBySize = false ;
67+
/**
 * Command-line flag ({@code -p} / {@code --percentage}). When set, {@code formatRatio}
 * renders each ratio multiplied by 100 with four decimal places and a trailing
 * {@code %}; when unset, the raw float value is printed as-is.
 */
68+ @ Parameter (
69+ names = {"-p" , "--percentage" },
70+ description = "Print ratio as percentage instead of decimal" ,
71+ required = false )
72+ boolean printAsPercentage = false ;
73+
5974 @ Override
6075 @ SuppressWarnings ("unchecked" )
6176 public int run () throws IOException {
@@ -67,6 +82,10 @@ public int run() throws IOException {
6782
6883 // If user defined columns, only print out size for those columns
6984 if (columns != null && !columns .isEmpty ()) {
85+ // Collect aggregated column data
86+ Map <String , Long > aggregatedSizes = new LinkedHashMap <>();
87+ Map <String , Float > aggregatedRatios = new LinkedHashMap <>();
88+
7089 for (String inputColumn : columns ) {
7190 long size = 0 ;
7291 float ratio = 0 ;
@@ -76,18 +95,52 @@ public int run() throws IOException {
7695 ratio += columnRatio .get (column );
7796 }
7897 }
79- console .info (inputColumn + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratio );
98+ aggregatedSizes .put (inputColumn , size );
99+ aggregatedRatios .put (inputColumn , ratio );
100+ }
101+
102+ // Sort if requested
103+ List <Map .Entry <String , Long >> entries = new ArrayList <>(aggregatedSizes .entrySet ());
104+ if (sortBySize ) {
105+ entries .sort (Map .Entry .<String , Long >comparingByValue ().reversed ());
106+ }
107+
108+ // Print results
109+ for (Map .Entry <String , Long > entry : entries ) {
110+ String column = entry .getKey ();
111+ long size = entry .getValue ();
112+ float ratio = aggregatedRatios .get (column );
113+ String ratioStr = formatRatio (ratio );
114+ console .info (column + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratioStr );
80115 }
81116 } else {
82- for (String column : columnSizes .keySet ()) {
83- console .info (column + "->" + " Size In Bytes: " + columnSizes .get (column ) + " Size In Ratio: "
84- + columnRatio .get (column ));
117+ // Sort if requested
118+ List <Map .Entry <String , Long >> entries = new ArrayList <>(columnSizes .entrySet ());
119+ if (sortBySize ) {
120+ entries .sort (Map .Entry .<String , Long >comparingByValue ().reversed ());
121+ }
122+
123+ // Print results
124+ for (Map .Entry <String , Long > entry : entries ) {
125+ String column = entry .getKey ();
126+ long size = entry .getValue ();
127+ float ratio = columnRatio .get (column );
128+ String ratioStr = formatRatio (ratio );
129+ console .info (column + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratioStr );
85130 }
86131 }
87132
88133 return 0 ;
89134 }
90135
136+ private String formatRatio (float ratio ) {
137+ if (printAsPercentage ) {
138+ return String .format (Locale .US , "%.4f%%" , ratio * 100 );
139+ } else {
140+ return String .valueOf (ratio );
141+ }
142+ }
143+
91144 @ Override
92145 public List <String > getExamples () {
93146 return Lists .newArrayList (
@@ -96,7 +149,16 @@ public List<String> getExamples() {
96149 "sample.parquet -c col_1" ,
97150 "sample.parquet --column col_2" ,
98151 "sample.parquet --columns col_1 col_2" ,
99- "sample.parquet --columns col_1 col_2.sub_col_a" );
152+ "sample.parquet --columns col_1 col_2.sub_col_a" ,
153+ "# Sort columns by size in descending order" ,
154+ "sample.parquet --sort" ,
155+ "sample.parquet -s" ,
156+ "# Print ratio as percentage" ,
157+ "sample.parquet --percentage" ,
158+ "sample.parquet -p" ,
159+ "# Combine sorting and percentage formatting" ,
160+ "sample.parquet --sort --percentage" ,
161+ "sample.parquet -s -p -c col_1 col_2" );
100162 }
101163
102164 // Make it public to allow some automation tools to call it
0 commit comments