Skip to content

Commit 99b61cc

Browse files
authored
HIVE-29616: Fix incorrect column lineage when multiple subqueries use identical table aliases (apache#6485)
1 parent 83d98f4 commit 99b61cc

3 files changed

Lines changed: 73 additions & 4 deletions

File tree

ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/ExprProcFactory.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,13 +183,12 @@ public static SemanticNodeProcessor getColumnProcessor() {
183183
return new ColumnExprProcessor();
184184
}
185185

186-
private static boolean findSourceColumn(
186+
private static boolean findSourceColumn(Operator<? extends OperatorDesc> inpOp,
187187
LineageCtx lctx, Predicate cond, String tabAlias, String alias) {
188188
for (Map.Entry<String, TableScanOperator> topOpMap: lctx.getParseCtx().getTopOps().entrySet()) {
189189
TableScanOperator tableScanOp = topOpMap.getValue();
190190
Table tbl = tableScanOp.getConf().getTableMetadata();
191-
if (tbl.getTableName().equals(tabAlias)
192-
|| tabAlias.equals(tableScanOp.getConf().getAlias())) {
191+
if (isMatchingTableScan(inpOp, tabAlias, tableScanOp, tbl)) {
193192
for (FieldSchema column: tbl.getCols()) {
194193
if (column.getName().equals(alias)) {
195194
TableAliasInfo table = new TableAliasInfo();
@@ -208,6 +207,16 @@ private static boolean findSourceColumn(
208207
return false;
209208
}
210209

210+
private static boolean isMatchingTableScan(Operator<? extends OperatorDesc> inpOp, String tabAlias,
211+
TableScanOperator tableScanOp, Table tbl) {
212+
boolean operatorIdMatches = inpOp.getOperatorId().equals(tableScanOp.getOperatorId());
213+
214+
boolean tableNameMatches = tbl.getTableName().equals(tabAlias);
215+
boolean aliasMatches = tabAlias.equals(tableScanOp.getConf().getAlias());
216+
217+
return operatorIdMatches && (tableNameMatches || aliasMatches);
218+
}
219+
211220
/**
212221
* Get the expression string of an expression node.
213222
*/
@@ -241,7 +250,7 @@ public static String getExprString(RowSchema rs, ExprNodeDesc expr,
241250
}
242251
if (tabAlias != null && tabAlias.length() > 0
243252
&& !tabAlias.startsWith("_") && !tabAlias.startsWith("$")) {
244-
if (cond != null && !findSourceColumn(lctx, cond, tabAlias, alias) && dep != null) {
253+
if (cond != null && !findSourceColumn(inpOp, lctx, cond, tabAlias, alias) && dep != null) {
245254
cond.getBaseCols().addAll(dep.getBaseCols());
246255
}
247256
return tabAlias + "." + alias;
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.LineageLogger;
2+
3+
create table table_1 (id1 int, id2 int);
4+
create table table_2 (id1 int, id2 int);
5+
6+
create table table_3 as
7+
select id1 from table_1 t1 where t1.id2 = 1
8+
union all
9+
select id1 from table_2 t1 where t1.id2 = 2;
10+
11+
create table table_4 as
12+
select id1 from (select id1,id2 from table_1 t1 where t1.id1 = 3 ) t1 where t1.id2 = 1
13+
union all
14+
select id1 from table_2 t1 where t1.id2 = 2;
15+
16+
create table table_5 as
17+
select t.id1 from
18+
(select id1 from table_1 t1 where t1.id2 = 1) t
19+
join table_2 t1 on t.id1 = t1.id2;
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
PREHOOK: query: create table table_1 (id1 int, id2 int)
2+
PREHOOK: type: CREATETABLE
3+
PREHOOK: Output: database:default
4+
PREHOOK: Output: default@table_1
5+
PREHOOK: query: create table table_2 (id1 int, id2 int)
6+
PREHOOK: type: CREATETABLE
7+
PREHOOK: Output: database:default
8+
PREHOOK: Output: default@table_2
9+
PREHOOK: query: create table table_3 as
10+
select id1 from table_1 t1 where t1.id2 = 1
11+
union all
12+
select id1 from table_2 t1 where t1.id2 = 2
13+
PREHOOK: type: CREATETABLE_AS_SELECT
14+
PREHOOK: Input: default@table_1
15+
PREHOOK: Input: default@table_2
16+
PREHOOK: Output: database:default
17+
PREHOOK: Output: default@table_3
18+
Result schema has 1 fields, but we don't get as many dependencies
19+
{"version":"1.0","engine":"tez","database":"default","hash":"24a0f860f60a1b7d5f350fd8eb164a37","queryText":"create table table_3 as\nselect id1 from table_1 t1 where t1.id2 = 1\nunion all\nselect id1 from table_2 t1 where t1.id2 = 2","edges":[{"sources":[1,2],"targets":[0],"expression":"id1","edgeType":"PROJECTION"},{"sources":[3],"targets":[0],"expression":"(t1.id2 = 1)","edgeType":"PREDICATE"},{"sources":[4],"targets":[0],"expression":"(t1.id2 = 2)","edgeType":"PREDICATE"}],"vertices":[{"id":0,"vertexType":"COLUMN","vertexId":"default.table_3.id1"},{"id":1,"vertexType":"COLUMN","vertexId":"default.table_1.id1"},{"id":2,"vertexType":"COLUMN","vertexId":"default.table_2.id1"},{"id":3,"vertexType":"COLUMN","vertexId":"default.table_1.id2"},{"id":4,"vertexType":"COLUMN","vertexId":"default.table_2.id2"}]}
20+
PREHOOK: query: create table table_4 as
21+
select id1 from (select id1,id2 from table_1 t1 where t1.id1 = 3 ) t1 where t1.id2 = 1
22+
union all
23+
select id1 from table_2 t1 where t1.id2 = 2
24+
PREHOOK: type: CREATETABLE_AS_SELECT
25+
PREHOOK: Input: default@table_1
26+
PREHOOK: Input: default@table_2
27+
PREHOOK: Output: database:default
28+
PREHOOK: Output: default@table_4
29+
Result schema has 1 fields, but we don't get as many dependencies
30+
{"version":"1.0","engine":"tez","database":"default","hash":"761d0cf34076cec77766bf7af8f1cbe9","queryText":"create table table_4 as\nselect id1 from (select id1,id2 from table_1 t1 where t1.id1 = 3 ) t1 where t1.id2 = 1\nunion all\nselect id1 from table_2 t1 where t1.id2 = 2","edges":[{"sources":[1],"targets":[0],"expression":"id1","edgeType":"PROJECTION"},{"sources":[2,3],"targets":[0],"expression":"((t1.id1 = 3) and (t1.id2 = 1))","edgeType":"PREDICATE"},{"sources":[4],"targets":[0],"expression":"(t1.id2 = 2)","edgeType":"PREDICATE"}],"vertices":[{"id":0,"vertexType":"COLUMN","vertexId":"default.table_4.id1"},{"id":1,"vertexType":"COLUMN","vertexId":"default.table_2.id1"},{"id":2,"vertexType":"COLUMN","vertexId":"default.table_1.id1"},{"id":3,"vertexType":"COLUMN","vertexId":"default.table_1.id2"},{"id":4,"vertexType":"COLUMN","vertexId":"default.table_2.id2"}]}
31+
PREHOOK: query: create table table_5 as
32+
select t.id1 from
33+
(select id1 from table_1 t1 where t1.id2 = 1) t
34+
join table_2 t1 on t.id1 = t1.id2
35+
PREHOOK: type: CREATETABLE_AS_SELECT
36+
PREHOOK: Input: default@table_1
37+
PREHOOK: Input: default@table_2
38+
PREHOOK: Output: database:default
39+
PREHOOK: Output: default@table_5
40+
Result schema has 1 fields, but we don't get as many dependencies
41+
{"version":"1.0","engine":"tez","database":"default","hash":"615bb67f6ff2dd50695bffd14c296677","queryText":"create table table_5 as\nselect t.id1 from\n(select id1 from table_1 t1 where t1.id2 = 1) t\njoin table_2 t1 on t.id1 = t1.id2","edges":[{"sources":[1],"targets":[0],"edgeType":"PROJECTION"},{"sources":[2,1],"targets":[0],"expression":"((t1.id2 = 1) and t1.id1 is not null)","edgeType":"PREDICATE"},{"sources":[1,3],"targets":[0],"expression":"(t1.id1 = t1.id2)","edgeType":"PREDICATE"},{"sources":[3],"targets":[0],"expression":"t1.id2 is not null","edgeType":"PREDICATE"}],"vertices":[{"id":0,"vertexType":"COLUMN","vertexId":"default.table_5.id1"},{"id":1,"vertexType":"COLUMN","vertexId":"default.table_1.id1"},{"id":2,"vertexType":"COLUMN","vertexId":"default.table_1.id2"},{"id":3,"vertexType":"COLUMN","vertexId":"default.table_2.id2"}]}

0 commit comments

Comments
 (0)