Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 72 additions & 13 deletions ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.commons.collections4.CollectionUtils;
Expand All @@ -38,6 +40,7 @@
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest;
import org.apache.hadoop.hive.ql.CompilationOpContext;
Expand Down Expand Up @@ -97,7 +100,8 @@ public int process(Hive db, Table tbl) throws Exception {
return persistColumnStats(db, tbl);
}

private boolean constructColumnStatsFromPackedRows(Table tbl, List<ColumnStatistics> stats, long maxNumStats)
private boolean constructColumnStatsFromPackedRows(Table tbl, List<ColumnStatistics> stats,
long maxNumStats, Map<String, List<String>> failedColumnStatsByTarget)
throws HiveException, MetaException, IOException {
String partName = null;
List<String> colName = colStatDesc.getColName();
Expand All @@ -118,6 +122,7 @@ private boolean constructColumnStatsFromPackedRows(Table tbl, List<ColumnStatist

// Partition columns are appended at end, we only care about stats column
int pos = 0;
List<String> failedColumns = new ArrayList<>();
for (int i = 0; i < colName.size(); i++) {
String columnName = colName.get(i);
String columnType = colType.get(i);
Expand All @@ -133,13 +138,14 @@ private boolean constructColumnStatsFromPackedRows(Table tbl, List<ColumnStatist
if (isStatsReliable) {
throw new HiveException("Statistics collection failed while (hive.stats.reliable)", e);
} else {
failedColumns.add(columnName);
LOG.debug("Because {} is infinite or NaN, we skip stats.", columnName, e);
}
}
pos += columnStatsFields.size();
}

if (!statsObjs.isEmpty()) {
if (!statsObjs.isEmpty() || !failedColumns.isEmpty()) {
if (!isTblLevel) {
List<FieldSchema> partColSchema = new ArrayList<>();
List<String> partVals = new ArrayList<>();
Expand Down Expand Up @@ -168,14 +174,22 @@ private boolean constructColumnStatsFromPackedRows(Table tbl, List<ColumnStatist
partName = Warehouse.makePartName(partColSchema, partVals);
}

ColumnStatisticsDesc statsDesc = buildColumnStatsDesc(tbl, partName, isTblLevel);
ColumnStatistics colStats = new ColumnStatistics();
colStats.setStatsDesc(statsDesc);
colStats.setStatsObj(statsObjs);
colStats.setEngine(Constants.HIVE_ENGINE);
stats.add(colStats);
if (numStats >= maxNumStats) {
return false;
if (!failedColumns.isEmpty()) {
String statsTarget = isTblLevel ? tbl.getFullyQualifiedName() : partName;
failedColumnStatsByTarget.computeIfAbsent(statsTarget, k -> new ArrayList<>())
.addAll(failedColumns);
}

if (!statsObjs.isEmpty()) {
ColumnStatisticsDesc statsDesc = buildColumnStatsDesc(tbl, partName, isTblLevel);
ColumnStatistics colStats = new ColumnStatistics();
colStats.setStatsDesc(statsDesc);
colStats.setStatsObj(statsObjs);
colStats.setEngine(Constants.HIVE_ENGINE);
stats.add(colStats);
if (numStats >= maxNumStats) {
return false;
}
}
}
}
Expand Down Expand Up @@ -215,12 +229,43 @@ public int persistColumnStats(Hive db, Table tbl) throws HiveException, MetaExce
long maxNumStats = conf.getLongVar(HiveConf.ConfVars.HIVE_STATS_MAX_NUM_STATS);
while (!done) {
List<ColumnStatistics> colStats = new ArrayList<>();
Map<String, List<String>> failedColumnStatsByTarget = new HashMap<>();

long start = System. currentTimeMillis();
done = constructColumnStatsFromPackedRows(tbl, colStats, maxNumStats);
long start = System.currentTimeMillis();
done = constructColumnStatsFromPackedRows(tbl, colStats, maxNumStats, failedColumnStatsByTarget);
long end = System.currentTimeMillis();
LOG.info("Time taken to build " + colStats.size() + " stats desc : " + ((end - start)/1000F) + " seconds.");

// Remove inaccurate column stats markers
List<Partition> partitionsToUpdate = new ArrayList<>();
for (Map.Entry<String, List<String>> entry : failedColumnStatsByTarget.entrySet()) {
List<String> failedColumns = entry.getValue();
if (CollectionUtils.isEmpty(failedColumns)) {
continue;
}

if (tbl.isNonNative() && tbl.getStorageHandler().canSetColStatistics(tbl)) {
if (!(tbl.isMaterializedView() || tbl.isView() || tbl.isTemporary())) {
setOrRemoveColumnStatsAccurateProperty(db, tbl, failedColumns, false);
}
} else {
if (colStatDesc.isTblLevel()) {
setOrRemoveColumnStatsAccurateProperty(db, tbl, failedColumns, false);
} else if (!tbl.hasNonNativePartitionSupport()) { // Native HMS partitions only
Map<String, String> partSpec = Warehouse.makeSpecFromName(entry.getKey());
Partition partition = db.getPartition(tbl, partSpec, false);
if (partition == null) {
LOG.debug("Skipping removal of column stats accurate marker for missing partition {}",
entry.getKey());
continue;
}
StatsSetupConst.removeColumnStatsState(partition.getParameters(), failedColumns);
partitionsToUpdate.add(partition);
}
}
}
removePartitionColumnStatsAccurateProperty(db, tbl, partitionsToUpdate);

// Persist the column statistics object to the metastore
// Note, this function is shared for both table and partition column stats.
if (colStats.isEmpty()) {
Expand All @@ -235,7 +280,7 @@ public int persistColumnStats(Hive db, Table tbl) throws HiveException, MetaExce
}
}

start = System. currentTimeMillis();
start = System.currentTimeMillis();
if (tbl.isNonNative() && tbl.getStorageHandler().canSetColStatistics(tbl)) {
boolean success = tbl.getStorageHandler().setColStatistics(tbl, colStats);
if (!(tbl.isMaterializedView() || tbl.isView() || tbl.isTemporary())) {
Expand Down Expand Up @@ -268,6 +313,20 @@ private void setOrRemoveColumnStatsAccurateProperty(Hive db, Table tbl, List<Str
db.alterTable(tbl.getFullyQualifiedName(), tbl, environmentContext, false);
}

/**
 * Pushes the supplied partitions back to the metastore through
 * {@code db.alterPartitions}, using an environment context in which
 * {@code StatsSetupConst.DO_NOT_UPDATE_STATS} is set to {@code StatsSetupConst.TRUE}.
 *
 * <p>This method does not modify the partitions itself; it only submits them as given.
 * In {@code persistColumnStats} the caller strips the column-stats state of the failed
 * columns from each partition's parameters before collecting it into the list.
 *
 * @param db         Hive handle used to issue the alter call
 * @param tbl        table owning the partitions; its fully qualified name is passed on
 * @param partitions partitions to alter; a {@code null} or empty list is a no-op
 * @throws HiveException if the alter call fails; an {@link InvalidOperationException}
 *                       is wrapped so that the original cause is preserved
 */
private void removePartitionColumnStatsAccurateProperty(Hive db, Table tbl, List<Partition> partitions)
    throws HiveException {
  // Nothing was collected, so there is nothing to send to the metastore.
  if (partitions == null || partitions.isEmpty()) {
    return;
  }

  EnvironmentContext alterContext = new EnvironmentContext();
  alterContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);

  try {
    String qualifiedTableName = tbl.getFullyQualifiedName();
    db.alterPartitions(qualifiedTableName, partitions, alterContext, false);
  } catch (InvalidOperationException invalidOp) {
    throw new HiveException(invalidOp);
  }
}

/**
* Enumeration of column stats fields that can currently
* be computed. Each one has a field name associated.
Expand Down
94 changes: 94 additions & 0 deletions ql/src/test/queries/clientpositive/stats_col_stats_inaccurate.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
set hive.stats.autogather=true;
set hive.stats.column.autogather=true;
set hive.stats.fetch.column.stats=true;

-- Check partitioned tables on float/double columns with Infinity/NaN on inaccurate stats.

create table stats_t1(
c_double double,
c_float float,
c_str string)
partitioned by (p int)
stored as ORC;

insert into table stats_t1 partition(p=1) values
(cast('Infinity' as double), cast('Infinity' as float), 'row1'),
(cast('-Infinity' as double), cast('-Infinity' as float), 'row2'),
(cast('NAN' as double), cast('NaN' as float), 'row3'),
(cast(1234 as double), 123.456, 'row4');

describe formatted stats_t1 partition(p=1) c_double;
describe formatted stats_t1 partition(p=1) c_float;
describe formatted stats_t1 partition(p=1);


-- Check non-partitioned tables on float/double columns with Infinity/NaN on inaccurate stats.

create table stats_t2(
c_double double,
c_float float,
c_str string)
stored as ORC;

insert into table stats_t2 values
(cast('Infinity' as double), cast('Infinity' as float), 'row1'),
(cast('-Infinity' as double), cast('-Infinity' as float), 'row2'),
(cast('NAN' as double), cast('NaN' as float), 'row3'),
(cast(1234 as double), 123.456, 'row4');
analyze table stats_t2 compute statistics for columns;


describe formatted stats_t2 c_double;
describe formatted stats_t2 c_float;
describe formatted stats_t2;

-- All columns fail with UnsupportedDoubleException
create table stats_t3(
a double,
b float)
partitioned by (p int)
stored as ORC;

insert into table stats_t3 partition(p=1) values
(cast('Infinity' as double), cast('Infinity' as float)),
(cast('-Infinity' as double), cast('-Infinity' as float)),
(cast('NaN' as double), cast('NaN' as float));
describe formatted stats_t3 partition(p=1) a;
describe formatted stats_t3 partition(p=1) b;
describe formatted stats_t3 partition(p=1);

-- Multiple partitions with different columns failed with UnsupportedDoubleException

create table stats_t4(
a double,
b float)
partitioned by (p int)
stored as ORC;

-- a fails with UnsupportedDoubleException, b normal
insert into table stats_t4 partition(p=1) values
(cast('Infinity' as double), 3.14),
(cast('-Infinity' as double), 2.72),
(cast('NaN' as double), 1.618);

-- a normal, b fails with UnsupportedDoubleException
insert into table stats_t4 partition(p=2) values
(42.0, cast('Infinity' as float)),
(17.3, cast('-Infinity' as float)),
(5.4, cast('NaN' as float));

-- both a and b fail with UnsupportedDoubleException
insert into table stats_t4 partition(p=3) values
(cast('Infinity' as double), cast('-Infinity' as float)),
(cast('-Infinity' as double), cast('Infinity' as float)),
(cast('NaN' as double), cast('NaN' as float));

-- both a and b normal
insert into table stats_t4 partition(p=4) values
(1.0, 2.0),
(3.0, 4.0);

describe formatted stats_t4 partition(p=1);
describe formatted stats_t4 partition(p=2);
describe formatted stats_t4 partition(p=3);
describe formatted stats_t4 partition(p=4);
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"b\":\"true\",\"c1\":\"true\",\"c10\":\"true\",\"c11\":\"true\",\"c12\":\"true\",\"c13\":\"true\",\"c14\":\"true\",\"c15\":\"true\",\"c2\":\"true\",\"c3\":\"true\",\"c4\":\"true\",\"c5\":\"true\",\"c6\":\"true\",\"c7\":\"true\",\"c8\":\"true\",\"c9\":\"true\",\"insert_num\":\"true\"}}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"b\":\"true\",\"c1\":\"true\",\"c10\":\"true\",\"c12\":\"true\",\"c13\":\"true\",\"c15\":\"true\",\"c3\":\"true\",\"c4\":\"true\",\"c6\":\"true\",\"c7\":\"true\",\"c9\":\"true\",\"insert_num\":\"true\"}}
bucketing_version 2
numFiles 1
numRows 5
Expand Down
Loading
Loading