Skip to content

Commit d4a2f16

Browse files
authored
HIVE-29617: Error while loading column statistics of Iceberg table after upgrading Hive (apache#6496)
1 parent 99b61cc commit d4a2f16

2 files changed

Lines changed: 90 additions & 12 deletions

File tree

ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java

Lines changed: 23 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -239,22 +239,33 @@ public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table tabl
239239
return aggregateStat.getNumRows();
240240
}
241241

242-
private static void estimateStatsForMissingCols(List<String> neededColumns, List<ColStatistics> columnStats,
243-
HiveConf conf, long nr, List<ColumnInfo> schema) {
242+
/**
243+
* Estimates column statistics for columns specified in {@code neededColumnNames}
244+
* that do not already have statistics in the {@code existingColStats} list.
245+
*
246+
* @return A {@link List} of {@link ColStatistics} objects containing
247+
* both the provided existing statistics and the newly estimated ones.
248+
*/
249+
static List<ColStatistics> estimateStatsForMissingCols(
250+
List<String> neededColumnNames, List<ColStatistics> existingColStats, HiveConf conf, long nr,
251+
List<ColumnInfo> schema) {
244252

245-
Set<String> neededCols = new HashSet<>(neededColumns);
246-
Set<String> colsWithStats = new HashSet<>();
253+
Set<String> neededCols = new HashSet<>(neededColumnNames);
254+
Set<String> columnNamesWithStats = HashSet.newHashSet(existingColStats.size());
247255

248-
for (ColStatistics cstats : columnStats) {
249-
colsWithStats.add(cstats.getColumnName());
256+
for (ColStatistics cstats : existingColStats) {
257+
columnNamesWithStats.add(cstats.getColumnName());
250258
}
251259

252-
List<String> missingColStats = new ArrayList<>(Sets.difference(neededCols, colsWithStats));
260+
List<String> missingColumnNames = new ArrayList<>(Sets.difference(neededCols, columnNamesWithStats));
261+
ArrayList<ColStatistics> combined = new ArrayList<>(existingColStats.size() + missingColumnNames.size());
262+
combined.addAll(existingColStats);
253263

254-
if (!missingColStats.isEmpty()) {
255-
columnStats.addAll(
256-
estimateStats(schema, missingColStats, conf, nr));
264+
if (!missingColumnNames.isEmpty()) {
265+
combined.addAll(estimateStats(schema, missingColumnNames, conf, nr));
257266
}
267+
268+
return combined;
258269
}
259270

260271
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
@@ -300,7 +311,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
300311
if (needColStats && !metaTable) {
301312
colStats = getTableColumnStats(table, neededColumns, colStatsCache, fetchColStats);
302313
if (estimateStats) {
303-
estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema);
314+
colStats = estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema);
304315
}
305316
// we should have stats for all columns (estimated or actual)
306317
if (neededColumns.size() == colStats.size()) {
@@ -386,7 +397,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
386397
boolean statsRetrieved = aggrStats != null &&
387398
aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
388399
if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) {
389-
estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema);
400+
columnStats = estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema);
390401
// There are some partitions with no state (or we didn't fetch any state).
391402
// Update the stats with empty list to reflect that in the
392403
// state/initialize structures.

ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,8 @@
1818

1919
package org.apache.hadoop.hive.ql.stats;
2020

21+
import static org.junit.Assert.assertFalse;
22+
import static org.junit.Assert.assertTrue;
2123
import static org.junit.jupiter.api.Assertions.assertEquals;
2224
import static org.junit.jupiter.api.Assertions.assertNotEquals;
2325
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -41,10 +43,12 @@
4143
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
4244
import org.apache.hadoop.hive.metastore.api.Timestamp;
4345
import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData;
46+
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
4447
import org.apache.hadoop.hive.ql.plan.ColStatistics;
4548
import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
4649
import org.apache.hadoop.hive.ql.plan.Statistics;
4750
import org.apache.hadoop.hive.serde.serdeConstants;
51+
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
4852
import org.junit.jupiter.api.Test;
4953
import org.junit.jupiter.params.ParameterizedTest;
5054
import org.junit.jupiter.params.provider.Arguments;
@@ -565,4 +569,67 @@ void testGetColStatisticsTimestampType() {
565569
assertEquals(1700000000L, range.maxValue.longValue(), "maxValue mismatch for TIMESTAMP");
566570
}
567571

572+
@Test
573+
void testEstimateStatsForMissingColsHandlesEmptyList() {
574+
HiveConf conf = new HiveConf();
575+
576+
ColumnInfo columnInfoA = new ColumnInfo("a", TypeInfoFactory.intTypeInfo, "t", false);
577+
578+
List<ColStatistics> allColumnStats = StatsUtils.estimateStatsForMissingCols(
579+
List.of("a"), Collections.emptyList(), conf, 0, List.of(columnInfoA));
580+
581+
assertEquals(1, allColumnStats.size());
582+
}
583+
584+
@Test
585+
void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() {
586+
HiveConf conf = new HiveConf();
587+
588+
ColumnInfo colNeededButNotExists = new ColumnInfo("neededButNotExists", TypeInfoFactory.intTypeInfo, "t", false);
589+
ColumnInfo colNeededAndExists = new ColumnInfo("neededAndExists", TypeInfoFactory.intTypeInfo, "t", false);
590+
ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists", TypeInfoFactory.intTypeInfo, "t", false);
591+
ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists", TypeInfoFactory.intTypeInfo, "t", false);
592+
593+
ColStatistics colStatNeededAndExists = new ColStatistics();
594+
colStatNeededAndExists.setColumnName(colNeededAndExists.getInternalName());
595+
ColStatistics colStatNotNeededButExists = new ColStatistics();
596+
colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName());
597+
598+
List<ColStatistics> allColumnStats = StatsUtils.estimateStatsForMissingCols(
599+
List.of(colNeededAndExists.getInternalName(), colNeededButNotExists.getInternalName()),
600+
List.of(colStatNeededAndExists, colStatNotNeededButExists),
601+
conf,
602+
0,
603+
List.of(colNeededButNotExists, colNeededAndExists, colNotNeededButExists, colNotNeededNotExists));
604+
605+
assertEquals(3, allColumnStats.size());
606+
assertEquals(colStatNeededAndExists, allColumnStats.get(0));
607+
assertFalse(allColumnStats.get(0).isEstimated());
608+
assertEquals(colStatNotNeededButExists, allColumnStats.get(1));
609+
assertFalse(allColumnStats.get(1).isEstimated());
610+
assertEquals(colNeededButNotExists.getInternalName(), allColumnStats.get(2).getColumnName());
611+
assertTrue(allColumnStats.get(2).isEstimated());
612+
}
613+
614+
@Test
615+
void testEstimateStatsForMissingColsReturnOnlyColumnsWithExistingStatsWhenNoNeededColumn() {
616+
HiveConf conf = new HiveConf();
617+
618+
ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists", TypeInfoFactory.intTypeInfo, "t", false);
619+
ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists", TypeInfoFactory.intTypeInfo, "t", false);
620+
621+
ColStatistics colStatNotNeededButExists = new ColStatistics();
622+
colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName());
623+
624+
List<ColStatistics> allColumnStats = StatsUtils.estimateStatsForMissingCols(
625+
Collections.emptyList(),
626+
List.of(colStatNotNeededButExists),
627+
conf,
628+
0,
629+
List.of(colNotNeededButExists, colNotNeededNotExists));
630+
631+
assertEquals(1, allColumnStats.size());
632+
assertEquals(allColumnStats.getFirst(), colStatNotNeededButExists);
633+
}
634+
568635
}

0 commit comments

Comments
 (0)