From 19e669386a4a5179bbe962355dd99ad2badc5796 Mon Sep 17 00:00:00 2001 From: Hanyu Wei Date: Thu, 4 Jun 2026 10:40:52 -0700 Subject: [PATCH 1/2] feat(ppl): support bare-field join criteria `join on ` shorthand Let `on`/`where` join criteria accept a bare field name (or `AND` of bare fields) as shorthand for the qualified equality on that common field: source=t1 | inner join on a t2 == on t1.a = t2.a source=t1 | inner join on a AND b t2 == on t1.a = t2.a AND t1.b = t2.b source=t1 | join on a t2 == on t1.a = t2.a (no prefix/alias) The shorthand behaves exactly like the explicit `on l.f = r.f` criteria: it KEEPS BOTH key columns (the right key renamed `.f`, or `.f` when no alias is given). This differs from the field-list join (`join f t2`), which merges the duplicate key to a single column. Implementation: - Grammar: reorder the joinCommand alternatives so the criteria alternative is listed first. This lets the no-prefix form `join on a ` parse `on` as the criteria keyword instead of the field-list alternative greedily consuming `on`/`where` as a field name. ANTLR ALL(*) picks the first-listed alternative on genuine ambiguity. - AstBuilder: a bare single-part field (or AND-chain of them) is left as the join condition verbatim -- the unresolved AST reflects what the user wrote. - Planner (CalciteRelNodeVisitor.visitJoin): the criteria branch detects a bare-field condition and builds the equi-join from it, resolving each field by stack position (LEFT.f = RIGHT.f) via the existing buildJoinConditionByFieldName helper. No new planner code, and no AST mutation. Because resolution is positional rather than by qualifier, a self-join `join on f t` is a genuine cross-scan equi-join rather than the `f = f` tautology a name-based rewrite would collapse to. Adds AstBuilder, Calcite plan, anonymizer, and integration tests; updates the join command user manual. Signed-off-by: Hanyu Wei --- .../sql/calcite/CalciteRelNodeVisitor.java | 20 +- .../sql/calcite/utils/JoinAndLookupUtils.java | 22 ++ docs/user/ppl/cmd/join.md | 7 +- .../sql/calcite/remote/CalcitePPLJoinIT.java | 100 ++++++++ ppl/src/main/antlr/OpenSearchPPLParser.g4 | 5 +- .../opensearch/sql/ppl/parser/AstBuilder.java | 2 + .../sql/ppl/calcite/CalcitePPLJoinTest.java | 241 ++++++++++++++++++ .../sql/ppl/parser/AstBuilderTest.java | 87 +++++++ .../ppl/utils/PPLQueryDataAnonymizerTest.java | 22 ++ 9 files changed, 498 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 91a30361a20..68f389363b3 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -1889,10 +1889,22 @@ public RelNode visitJoin(Join node, CalcitePlanContext context) { } } else { // The join-with-criteria grammar doesn't allow empty join condition - RexNode joinCondition = - node.getJoinCondition() - .map(c -> rexVisitor.analyzeJoinCondition(c, context)) - .orElse(context.relBuilder.literal(true)); + RexNode joinCondition; + List bareFields = new ArrayList<>(); + if (JoinAndLookupUtils.collectBareFields(node.getJoinCondition().get(), bareFields)) { + // Bare-field shorthand `on a [AND b ...]`. Resolving by stack position rather than by + // qualifier keeps a self-join `join on f t` a real cross-scan equi-join, not f = f. + joinCondition = + bareFields.stream() + .map(f -> buildJoinConditionByFieldName(context, f)) + .reduce(context.rexBuilder::and) + .orElse(context.relBuilder.literal(true)); + } else { + joinCondition = + node.getJoinCondition() + .map(c -> rexVisitor.analyzeJoinCondition(c, context)) + .orElse(context.relBuilder.literal(true)); + } if (node.getJoinType() == SEMI || node.getJoinType() == ANTI) { // semi and anti join only return left table outputs context.relBuilder.join( diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java index dce882bc151..9ed9d58a5da 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java @@ -13,6 +13,10 @@ import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rex.RexNode; import org.apache.calcite.util.Pair; +import org.opensearch.sql.ast.expression.And; +import org.opensearch.sql.ast.expression.Field; +import org.opensearch.sql.ast.expression.QualifiedName; +import org.opensearch.sql.ast.expression.UnresolvedExpression; import org.opensearch.sql.ast.tree.Join; import org.opensearch.sql.ast.tree.Lookup; import org.opensearch.sql.calcite.CalcitePlanContext; @@ -37,6 +41,24 @@ static JoinRelType translateJoinType(Join.JoinType joinType) { } } + /** + * Collects the names of bare single-part-field join criteria (e.g. `on a AND b` -> [a, b]). + * Returns false for anything else (qualified field, comparison, OR); {@code out} is only valid + * when this returns true, so callers must check the return before reading it. + */ + static boolean collectBareFields(UnresolvedExpression expr, List out) { + if (expr instanceof And and) { + return collectBareFields(and.getLeft(), out) && collectBareFields(and.getRight(), out); + } + if (expr instanceof Field field + && field.getField() instanceof QualifiedName qn + && qn.getParts().size() == 1) { + out.add(qn.getParts().get(0)); + return true; + } + return false; + } + /* ------For Lookup------ */ /** diff --git a/docs/user/ppl/cmd/join.md b/docs/user/ppl/cmd/join.md index 548d448b6a0..87c48d09dbe 100644 --- a/docs/user/ppl/cmd/join.md +++ b/docs/user/ppl/cmd/join.md @@ -29,6 +29,9 @@ source = table1 | left anti join left = l right = r on l.a = r.a table2 source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] source = table1 | inner join on table1.a = table2.a table2 | fields table1.a, table2.a, table1.b, table1.c source = table1 | inner join on a = c table2 | fields a, b, c, d +source = table1 | inner join on a table2 | fields a, table2.a, b, c +source = table1 | inner join on a AND b table2 | fields a, table2.a, b, table2.b +source = table1 | join on a table2 | fields a, table2.a, b, c source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields l.a, r.a source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields t1.a, t2.a source = table1 | join left = l right = r on l.a = r.a [ source = table2 ] as s | fields l.a, s.a @@ -40,7 +43,7 @@ The basic `join` syntax supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | -| `` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. | +| `` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. A bare field name `f` (or `f AND g ...`) is shorthand for an equi-join on the common field(s) and keeps both key columns. | | `` | Required | The right dataset, which can be an index or a subsearch, with or without an alias. | | `joinType` | Optional | The type of join to perform. Valid values are `left`, `semi`, `anti`, and performance-sensitive types (`right`, `full`, and `cross`). Default is `inner`. | | `left` | Optional | An alias for the left dataset (typically a subsearch) used to avoid ambiguous field names. Specify as `left = `. | @@ -71,7 +74,7 @@ The extended `join` syntax supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | -| `` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. | +| `` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. A bare field name `f` (or `f AND g ...`) is shorthand for an equi-join on the common field(s); it keeps both key columns and differs from ``, which merges duplicates. | | `` | Required | The right dataset, which can be an index or a subsearch, with or without an alias. | | `type` | Optional | The join type when using extended syntax. Valid values are `left`, `outer` (same as `left`), `semi`, `anti`, and performance-sensitive types (`right`, `full`, and `cross`). Default is `inner`. | | `` | Optional | A list of fields used to build the join criteria. These fields must exist in both datasets. If not specified, all fields common to both datasets are used as join keys. | diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java index d6d1c72f992..b6abc8fa541 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java @@ -928,6 +928,106 @@ public void testJoinWhenLegacyNotPreferred() throws IOException { }); } + @Test + public void testJoinWithImplicitField() throws IOException { + // Keep-both, so co-named columns (other than the key) resolve to the left side. + JSONObject actual = + executeQuery( + String.format( + "source=%s | inner join on name %s | fields name, age, state, country, occupation," + + " salary", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION)); + verifySchema( + actual, + schema("name", "string"), + schema("age", "int"), + schema("state", "string"), + schema("country", "string"), + schema("occupation", "string"), + schema("salary", "int")); + verifyDataRows( + actual, + rows("Jake", 70, "California", "USA", "Engineer", 100000), + rows("Hello", 30, "New York", "USA", "Artist", 70000), + rows("John", 25, "Ontario", "Canada", "Doctor", 120000), + rows("Jane", 20, "Quebec", "Canada", "Scientist", 90000), + rows("David", 40, "Washington", "USA", "Doctor", 120000), + rows("David", 40, "Washington", "USA", "Unemployed", 0)); + + // The shorthand and the explicit qualified-equality form are output-identical. + JSONObject explicitForm = + executeQuery( + String.format( + "source=%s | inner join on %s.name = %s.name %s | fields name, age, state, country," + + " occupation, salary", + TEST_INDEX_STATE_COUNTRY, + TEST_INDEX_STATE_COUNTRY, + TEST_INDEX_OCCUPATION, + TEST_INDEX_OCCUPATION)); + assertJsonEquals(explicitForm.toString(), actual.toString()); + } + + @Test + public void testJoinNoPrefixImplicitField() throws IOException { + // No-prefix `join on name` matches the explicit qualified form, not the field-list `join name`. + JSONObject actual = + executeQuery( + String.format( + "source=%s | join on name %s | fields name, age, state, occupation, salary", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION)); + JSONObject explicitForm = + executeQuery( + String.format( + "source=%s | join on %s.name = %s.name %s | fields name, age, state, occupation," + + " salary", + TEST_INDEX_STATE_COUNTRY, + TEST_INDEX_STATE_COUNTRY, + TEST_INDEX_OCCUPATION, + TEST_INDEX_OCCUPATION)); + assertJsonEquals(explicitForm.toString(), actual.toString()); + } + + @Test + public void testLeftJoinWithImplicitField() throws IOException { + // Keep-both does not coalesce, so unmatched-left rows keep the non-null left key, not a NULL. + JSONObject actual = + executeQuery( + String.format( + "source=%s | left join on name %s | fields name, age, state, occupation, salary", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION)); + verifySchema( + actual, + schema("name", "string"), + schema("age", "int"), + schema("state", "string"), + schema("occupation", "string"), + schema("salary", "int")); + verifyDataRows( + actual, + rows("Jake", 70, "California", "Engineer", 100000), + rows("Hello", 30, "New York", "Artist", 70000), + rows("John", 25, "Ontario", "Doctor", 120000), + rows("Jane", 20, "Quebec", "Scientist", 90000), + rows("David", 40, "Washington", "Doctor", 120000), + rows("David", 40, "Washington", "Unemployed", 0), + rows("Jim", 27, "B.C", null, null), + rows("Peter", 57, "B.C", null, null), + rows("Rick", 70, "B.C", null, null)); + } + + @Test + public void testAliasedSelfJoinWithImplicitField() throws IOException { + // Self-join on the unique `name`: a real equi-join matches each of the 8 rows to itself (8 + // rows), not the 64-row cross product a tautology would give. + JSONObject actual = + executeQuery( + String.format( + "source=%s | inner join left=l right=r on name %s | fields name, r.name", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY)); + verifyNumOfRows(actual, 8); + verifySchema(actual, schema("name", "string"), schema("r.name", "string")); + } + @Test public void testJoinComparing() throws IOException { JSONObject actual = diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index ff46a3649a0..98f85b08282 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -729,8 +729,9 @@ sourceFilterArg // join joinCommand - : JOIN (joinOption)* (fieldList)? right = tableOrSubqueryClause - | sqlLikeJoinType? JOIN (joinOption)* sideAlias joinHintList? joinCriteria right = tableOrSubqueryClause + // Criteria alt listed first - so `join on a` reads `on` as the criteria keyword, not a field. + : sqlLikeJoinType? JOIN (joinOption)* sideAlias joinHintList? joinCriteria right = tableOrSubqueryClause + | JOIN (joinOption)* (fieldList)? right = tableOrSubqueryClause ; sqlLikeJoinType diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index d4f5eea0fb7..3741137f5a9 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -326,6 +326,8 @@ public UnresolvedPlan visitJoinCommand(OpenSearchPPLParser.JoinCommandContext ct if (ctx.fieldList() != null) { joinFields = Optional.of(getFieldList(ctx.fieldList())); } + // Keep a bare `on ` criteria verbatim; the planner expands it to an equi-join. Folding + // it into joinFields here would instead merge the key into one column. return new Join( projectExceptMeta(right), leftAlias, diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLJoinTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLJoinTest.java index b9c6dd4b093..eea4bd28cb1 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLJoinTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLJoinTest.java @@ -1163,4 +1163,245 @@ public void testSqlLikeJoinWithSpecificJoinType() { + "LEFT JOIN `scott`.`DEPT` ON `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`"; verifyPPLToSparkSQL(root, expectedSparkSql); } + + // A join-type prefix or alias is required for `on`/`where` to read as the criteria keyword + // rather than as a field list. + @Test + public void testJoinWithImplicitField() { + String ppl = "source=EMP | inner join on DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], DEPT.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + + String expectedSparkSql = + "SELECT `EMP`.`EMPNO`, `EMP`.`ENAME`, `EMP`.`JOB`, `EMP`.`MGR`, `EMP`.`HIREDATE`," + + " `EMP`.`SAL`, `EMP`.`COMM`, `EMP`.`DEPTNO`, `DEPT`.`DEPTNO` `DEPT.DEPTNO`," + + " `DEPT`.`DNAME`, `DEPT`.`LOC`\n" + + "FROM `scott`.`EMP`\n" + + "INNER JOIN `scott`.`DEPT` ON `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testJoinWithImplicitFieldAlias() { + String ppl = "source=EMP | join left=l right=r on DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], r.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + + String expectedSparkSql = + "SELECT `EMP`.`EMPNO`, `EMP`.`ENAME`, `EMP`.`JOB`, `EMP`.`MGR`, `EMP`.`HIREDATE`," + + " `EMP`.`SAL`, `EMP`.`COMM`, `EMP`.`DEPTNO`, `DEPT`.`DEPTNO` `r.DEPTNO`," + + " `DEPT`.`DNAME`, `DEPT`.`LOC`\n" + + "FROM `scott`.`EMP`\n" + + "INNER JOIN `scott`.`DEPT` ON `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testJoinWithImplicitFieldWhereKeyword() { + String ppl = "source=EMP | inner join where DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], DEPT.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + + String expectedSparkSql = + "SELECT `EMP`.`EMPNO`, `EMP`.`ENAME`, `EMP`.`JOB`, `EMP`.`MGR`, `EMP`.`HIREDATE`," + + " `EMP`.`SAL`, `EMP`.`COMM`, `EMP`.`DEPTNO`, `DEPT`.`DEPTNO` `DEPT.DEPTNO`," + + " `DEPT`.`DNAME`, `DEPT`.`LOC`\n" + + "FROM `scott`.`EMP`\n" + + "INNER JOIN `scott`.`DEPT` ON `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testLeftJoinWithImplicitField() { + String ppl = "source=EMP | left join on DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], DEPT.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[left])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + + String expectedSparkSql = + "SELECT `EMP`.`EMPNO`, `EMP`.`ENAME`, `EMP`.`JOB`, `EMP`.`MGR`, `EMP`.`HIREDATE`," + + " `EMP`.`SAL`, `EMP`.`COMM`, `EMP`.`DEPTNO`, `DEPT`.`DEPTNO` `DEPT.DEPTNO`," + + " `DEPT`.`DNAME`, `DEPT`.`LOC`\n" + + "FROM `scott`.`EMP`\n" + + "LEFT JOIN `scott`.`DEPT` ON `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testSemiJoinWithImplicitField() { + String ppl = "source=EMP | semi join on DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalJoin(condition=[=($7, $8)], joinType=[semi])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + + String expectedSparkSql = + "SELECT *\n" + + "FROM `scott`.`EMP`\n" + + "WHERE EXISTS (SELECT 1\n" + + "FROM `scott`.`DEPT`\n" + + "WHERE `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`)"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testJoinWithImplicitFieldMultiField() { + String ppl = "source=EMP | inner join on DEPTNO AND SAL EMP"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], EMP.EMPNO=[$8], EMP.ENAME=[$9], EMP.JOB=[$10]," + + " EMP.MGR=[$11], EMP.HIREDATE=[$12], EMP.SAL=[$13], EMP.COMM=[$14]," + + " EMP.DEPTNO=[$15])\n" + + " LogicalJoin(condition=[AND(=($7, $15), =($5, $13))], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + } + + // Self-join joins the two scans by position, so the key is a real equi-join across them, not the + // EMP.DEPTNO=EMP.DEPTNO tautology a qualifier rewrite would collapse to. + @Test + public void testJoinWithImplicitFieldSelfJoin() { + String ppl = "source=EMP | inner join on DEPTNO EMP"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], EMP.EMPNO=[$8], EMP.ENAME=[$9], EMP.JOB=[$10]," + + " EMP.MGR=[$11], EMP.HIREDATE=[$12], EMP.SAL=[$13], EMP.COMM=[$14]," + + " EMP.DEPTNO=[$15])\n" + + " LogicalJoin(condition=[=($7, $15)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + } + + // ENAME is not in DEPT, so the right-side lookup fails. Match the field name only; Calcite's + // full message is version-dependent. + @Test + public void testJoinWithImplicitFieldNotOnBothSides() { + String ppl = "source=EMP | inner join on ENAME DEPT"; + Throwable t = Assert.assertThrows(RuntimeException.class, () -> getRelNode(ppl)); + verifyErrorMessageContains(t, "ENAME"); + } + + @Test + public void testJoinNoPrefixImplicitField() { + String ppl = "source=EMP | join on DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], DEPT.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + + String expectedSparkSql = + "SELECT `EMP`.`EMPNO`, `EMP`.`ENAME`, `EMP`.`JOB`, `EMP`.`MGR`, `EMP`.`HIREDATE`," + + " `EMP`.`SAL`, `EMP`.`COMM`, `EMP`.`DEPTNO`, `DEPT`.`DEPTNO` `DEPT.DEPTNO`," + + " `DEPT`.`DNAME`, `DEPT`.`LOC`\n" + + "FROM `scott`.`EMP`\n" + + "INNER JOIN `scott`.`DEPT` ON `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testJoinNoPrefixImplicitFieldWhereKeyword() { + String ppl = "source=EMP | join where DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], DEPT.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + + String expectedSparkSql = + "SELECT `EMP`.`EMPNO`, `EMP`.`ENAME`, `EMP`.`JOB`, `EMP`.`MGR`, `EMP`.`HIREDATE`," + + " `EMP`.`SAL`, `EMP`.`COMM`, `EMP`.`DEPTNO`, `DEPT`.`DEPTNO` `DEPT.DEPTNO`," + + " `DEPT`.`DNAME`, `DEPT`.`LOC`\n" + + "FROM `scott`.`EMP`\n" + + "INNER JOIN `scott`.`DEPT` ON `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + // Right side is a subquery, not a table: the bare field still resolves (no underlying table name + // is needed), and the right key takes the subquery alias. + @Test + public void testJoinWithImplicitFieldAliasedSubquery() { + String ppl = "source=EMP | inner join on DEPTNO [ source=DEPT ] as d"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], d.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + } + + // Unaliased subquery: resolves by position with no alias and no reachable table name. + @Test + public void testJoinWithImplicitFieldUnaliasedSubquery() { + String ppl = "source=EMP | inner join on DEPTNO [ source=DEPT ]"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], DEPT.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + } + + // Aliases make the self-join meaningful: the right columns surface under `r`. + @Test + public void testJoinWithImplicitFieldAliasedSelfJoin() { + String ppl = "source=EMP | inner join left=l right=r on DEPTNO EMP"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], r.EMPNO=[$8], r.ENAME=[$9], r.JOB=[$10], r.MGR=[$11]," + + " r.HIREDATE=[$12], r.SAL=[$13], r.COMM=[$14], r.DEPTNO=[$15])\n" + + " LogicalJoin(condition=[=($7, $15)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java index c5b033bfc9f..9d70487c741 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java @@ -13,6 +13,7 @@ import static org.opensearch.sql.ast.dsl.AstDSL.agg; import static org.opensearch.sql.ast.dsl.AstDSL.aggregate; import static org.opensearch.sql.ast.dsl.AstDSL.alias; +import static org.opensearch.sql.ast.dsl.AstDSL.and; import static org.opensearch.sql.ast.dsl.AstDSL.appendPipe; import static org.opensearch.sql.ast.dsl.AstDSL.argument; import static org.opensearch.sql.ast.dsl.AstDSL.booleanLiteral; @@ -77,6 +78,7 @@ import org.opensearch.sql.ast.tree.AD; import org.opensearch.sql.ast.tree.Chart; import org.opensearch.sql.ast.tree.GraphLookup; +import org.opensearch.sql.ast.tree.Join; import org.opensearch.sql.ast.tree.Kmeans; import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.RareTopN.CommandType; @@ -1852,4 +1854,89 @@ public void testUnionWithMaxoutOption() { public void testMaxoutAsFieldName() { plan("source=t | eval maxout = 1"); } + + // These assert on joinCondition/joinFields rather than full Join equality, since JoinHint has no + // equals(). + @Test + public void testJoinWithImplicitFieldKeepsBareField() { + Join shorthand = (Join) plan("source=t1 | inner join on a AND b t2"); + assertTrue(shorthand.getJoinFields().isEmpty()); + assertEquals(Optional.of(and(field("a"), field("b"))), shorthand.getJoinCondition()); + } + + @Test + public void testJoinWithSingleImplicitFieldKeepsBareField() { + Join shorthand = (Join) plan("source=t1 | inner join on a t2"); + assertTrue(shorthand.getJoinFields().isEmpty()); + assertEquals(Optional.of(field("a")), shorthand.getJoinCondition()); + } + + @Test + public void testJoinWithMultipleImplicitFieldsFlattenInOrder() { + Join join = (Join) plan("source=t1 | inner join on a AND b AND c t2"); + assertTrue(join.getJoinFields().isEmpty()); + assertEquals( + Optional.of(and(and(field("a"), field("b")), field("c"))), join.getJoinCondition()); + } + + @Test + public void testJoinWithImplicitFieldWhereKeywordKeepsBareField() { + Join join = (Join) plan("source=t1 | inner join where a t2"); + assertTrue(join.getJoinFields().isEmpty()); + assertEquals(Optional.of(field("a")), join.getJoinCondition()); + } + + @Test + public void testJoinWithQualifiedConditionIsNotRewritten() { + Join join = (Join) plan("source=t1 | inner join on l.a = r.a t2"); + assertEquals( + Optional.of(compare("=", field(qualifiedName("l", "a")), field(qualifiedName("r", "a")))), + join.getJoinCondition()); + assertTrue(join.getJoinFields().isEmpty()); + } + + // No-prefix form: the grammar reorder lets `on` read as the criteria keyword, not a field. + @Test + public void testJoinNoPrefixImplicitFieldKeepsBareField() { + Join join = (Join) plan("source=t1 | join on a t2"); + assertTrue(join.getJoinFields().isEmpty()); + assertEquals(Optional.of(field("a")), join.getJoinCondition()); + } + + @Test + public void testJoinNoPrefixImplicitFieldWhereKeepsBareField() { + Join join = (Join) plan("source=t1 | join where a t2"); + assertTrue(join.getJoinFields().isEmpty()); + assertEquals(Optional.of(field("a")), join.getJoinCondition()); + } + + @Test + public void testJoinWithImplicitFieldKeepsAliasesOnNode() { + Join join = (Join) plan("source=t1 | join left = l right = r on a t2"); + assertTrue(join.getJoinFields().isEmpty()); + assertEquals(Optional.of(field("a")), join.getJoinCondition()); + assertEquals(Optional.of("l"), join.getLeftAlias()); + assertEquals(Optional.of("r"), join.getRightAlias()); + } + + // The reorder must not change the field-list form. + @Test + public void testJoinFieldListStillParsesAsFieldList() { + Join join = (Join) plan("source=t1 | join a t2"); + assertEquals(Optional.empty(), join.getJoinCondition()); + assertEquals(Optional.of(List.of(field("a"))), join.getJoinFields()); + } + + @Test + public void testJoinNoPrefixComparisonStaysCondition() { + Join join = (Join) plan("source=t1 | join on a = b t2"); + assertTrue(join.getJoinCondition().isPresent()); + assertTrue(join.getJoinFields().isEmpty()); + } + + // A prefix with a bare field but no on/where is still a field list, so this is a syntax error. + @Test + public void testJoinPrefixWithoutCriteriaKeywordIsSyntaxError() { + assertThrows(SyntaxCheckException.class, () -> plan("source=t1 | inner join a t2")); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index 585575b2b24..6756cdc198a 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -619,6 +619,28 @@ public void testJoinWithFieldList() { anonymize("source=t | join type=outer max=2 id1 id2 s | fields id1")); } + @Test + public void testJoinWithImplicitField() { + // The AST keeps the bare field, so the anonymized query shows `on identifier`, not a rewrite. + assertEquals( + "source=table | inner join max=*** on identifier table | fields + identifier", + anonymize("source=t | inner join on id s | fields id")); + assertEquals( + "source=table as identifier | inner join max=*** left = identifier right = identifier on" + + " identifier table as identifier | fields + identifier", + anonymize("source=t | join left = l right = r on id s | fields id")); + assertEquals( + "source=table | inner join max=*** on identifier and identifier table | fields +" + + " identifier", + anonymize("source=t | inner join on id AND uid s | fields id")); + assertEquals( + "source=table | inner join max=*** on identifier table | fields + identifier", + anonymize("source=t | inner join where id s | fields id")); + assertEquals( + "source=table | inner join max=*** on identifier table | fields + identifier", + anonymize("source=t | join on id s | fields id")); + } + @Test public void testLookup() { assertEquals( From af50442b3758bb41005c9d76b9a7dbd26f5ab44d Mon Sep 17 00:00:00 2001 From: Hanyu Wei Date: Wed, 10 Jun 2026 09:13:59 -0700 Subject: [PATCH 2/2] [Enhancement] Address round-2 review feedback for join shorthand Follow-ups from the second review round on #5517: - collectBareFields: replace the boolean-return + out-param pattern with an Optional> return (RyanL1997). The name list is now assembled only when every node in the AND-tree is a bare field, so the "valid only when true" partial-state footgun is gone. Update the visitJoin caller. - Add JoinAndLookupUtilsTest covering single field, AND-chain, qualified field, mixed condition, and the no-partial-state case. - Add testJoinMixedConditionFallsThrough / testJoinMixedConditionWithInequality confirming a mixed condition (e.g. `on a AND b.x = c.y`) is not detected as bare and falls through to analyzeJoinCondition unchanged (RyanL1997). - Add testJoinWithImplicitFieldMaxGreaterThanZero and testAntiJoinWithImplicitField for the max=N dedup wrapper and the anti-join early-return branch. - CalcitePPLJoinIT.testJoinWithImplicitField: assert the sorted result with verifyDataRowsInOrder so the `| sort name, occupation` is actually validated instead of being silently order-independent. - Tighten the not-on-both-sides assertion to IllegalArgumentException with the field-not-found substring; correct the comment to note the phrase is Calcite's own wording and may shift on a Calcite upgrade. Signed-off-by: Hanyu Wei --- .../sql/calcite/CalciteRelNodeVisitor.java | 7 +- .../sql/calcite/utils/JoinAndLookupUtils.java | 27 +++-- .../calcite/utils/JoinAndLookupUtilsTest.java | 72 ++++++++++++ .../sql/calcite/remote/CalcitePPLJoinIT.java | 25 +++-- .../sql/ppl/calcite/CalcitePPLJoinTest.java | 104 ++++++++++++++++-- 5 files changed, 203 insertions(+), 32 deletions(-) create mode 100644 core/src/test/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtilsTest.java diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 68f389363b3..0e687527f58 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -1890,12 +1890,13 @@ public RelNode visitJoin(Join node, CalcitePlanContext context) { } else { // The join-with-criteria grammar doesn't allow empty join condition RexNode joinCondition; - List bareFields = new ArrayList<>(); - if (JoinAndLookupUtils.collectBareFields(node.getJoinCondition().get(), bareFields)) { + Optional> bareFields = + JoinAndLookupUtils.collectBareFields(node.getJoinCondition().get()); + if (bareFields.isPresent()) { // Bare-field shorthand `on a [AND b ...]`. Resolving by stack position rather than by // qualifier keeps a self-join `join on f t` a real cross-scan equi-join, not f = f. joinCondition = - bareFields.stream() + bareFields.get().stream() .map(f -> buildJoinConditionByFieldName(context, f)) .reduce(context.rexBuilder::and) .orElse(context.relBuilder.literal(true)); diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java index 9ed9d58a5da..7094853b041 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java @@ -9,6 +9,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.stream.Collectors; import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rex.RexNode; @@ -42,21 +43,29 @@ static JoinRelType translateJoinType(Join.JoinType joinType) { } /** - * Collects the names of bare single-part-field join criteria (e.g. `on a AND b` -> [a, b]). - * Returns false for anything else (qualified field, comparison, OR); {@code out} is only valid - * when this returns true, so callers must check the return before reading it. + * Collects the names of bare single-part-field join criteria (e.g. {@code on a AND b} -> {@code + * Optional.of(["a","b"])}). Returns empty for anything else (qualified field, comparison, OR). + * The list is only constructed when every node in the AND-tree is a bare field. */ - static boolean collectBareFields(UnresolvedExpression expr, List out) { + static Optional> collectBareFields(UnresolvedExpression expr) { if (expr instanceof And and) { - return collectBareFields(and.getLeft(), out) && collectBareFields(and.getRight(), out); + return collectBareFields(and.getLeft()) + .flatMap( + left -> + collectBareFields(and.getRight()) + .map( + right -> { + List merged = new ArrayList<>(left); + merged.addAll(right); + return merged; + })); } if (expr instanceof Field field && field.getField() instanceof QualifiedName qn - && qn.getParts().size() == 1) { - out.add(qn.getParts().get(0)); - return true; + && qn.getPrefix().isEmpty()) { + return Optional.of(new ArrayList<>(List.of(qn.getSuffix()))); } - return false; + return Optional.empty(); } /* ------For Lookup------ */ diff --git a/core/src/test/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtilsTest.java b/core/src/test/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtilsTest.java new file mode 100644 index 00000000000..f8b6739066c --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtilsTest.java @@ -0,0 +1,72 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; +import java.util.Optional; +import org.junit.jupiter.api.Test; +import org.opensearch.sql.ast.expression.And; +import org.opensearch.sql.ast.expression.Compare; +import org.opensearch.sql.ast.expression.Field; +import org.opensearch.sql.ast.expression.QualifiedName; +import org.opensearch.sql.ast.expression.UnresolvedExpression; + +public class JoinAndLookupUtilsTest { + + @Test + public void collectBareFields_singleField() { + UnresolvedExpression expr = bareField("a"); + Optional> result = JoinAndLookupUtils.collectBareFields(expr); + assertTrue(result.isPresent()); + assertEquals(List.of("a"), result.get()); + } + + @Test + public void collectBareFields_multipleFieldsAndChain() { + UnresolvedExpression expr = new And(bareField("a"), new And(bareField("b"), bareField("c"))); + Optional> result = JoinAndLookupUtils.collectBareFields(expr); + assertTrue(result.isPresent()); + assertEquals(List.of("a", "b", "c"), result.get()); + } + + @Test + public void collectBareFields_mixedConditionReturnsEmpty() { + // `a AND l.x = r.x` — the Compare node makes this not all-bare-fields + UnresolvedExpression compare = + new Compare("=", qualifiedField("l", "x"), qualifiedField("r", "x")); + UnresolvedExpression expr = new And(bareField("a"), compare); + Optional> result = JoinAndLookupUtils.collectBareFields(expr); + assertTrue(result.isEmpty()); + } + + @Test + public void collectBareFields_qualifiedFieldReturnsEmpty() { + // A two-part name like `t.x` is not a bare field + UnresolvedExpression expr = qualifiedField("t", "x"); + Optional> result = JoinAndLookupUtils.collectBareFields(expr); + assertTrue(result.isEmpty()); + } + + @Test + public void collectBareFields_mixedLeftSideReturnsEmptyNoPartialState() { + // Compare on left, bare field on right — must return empty (not ["b"]) + UnresolvedExpression compare = new Compare("=", bareField("x"), bareField("y")); + UnresolvedExpression expr = new And(compare, bareField("b")); + Optional> result = JoinAndLookupUtils.collectBareFields(expr); + assertTrue(result.isEmpty()); + } + + private static Field bareField(String name) { + return new Field(QualifiedName.of(name)); + } + + private static Field qualifiedField(String qualifier, String name) { + return new Field(QualifiedName.of(qualifier, name)); + } +} diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java index b6abc8fa541..9aa9b022681 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java @@ -935,7 +935,7 @@ public void testJoinWithImplicitField() throws IOException { executeQuery( String.format( "source=%s | inner join on name %s | fields name, age, state, country, occupation," - + " salary", + + " salary | sort name, occupation", TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION)); verifySchema( actual, @@ -945,21 +945,23 @@ public void testJoinWithImplicitField() throws IOException { schema("country", "string"), schema("occupation", "string"), schema("salary", "int")); - verifyDataRows( + // Rows are listed in the `sort name, occupation` order so verifyDataRowsInOrder also + // validates the sort (David/Doctor < David/Unemployed, then Hello, Jake, Jane, John). + verifyDataRowsInOrder( actual, - rows("Jake", 70, "California", "USA", "Engineer", 100000), + rows("David", 40, "Washington", "USA", "Doctor", 120000), + rows("David", 40, "Washington", "USA", "Unemployed", 0), rows("Hello", 30, "New York", "USA", "Artist", 70000), - rows("John", 25, "Ontario", "Canada", "Doctor", 120000), + rows("Jake", 70, "California", "USA", "Engineer", 100000), rows("Jane", 20, "Quebec", "Canada", "Scientist", 90000), - rows("David", 40, "Washington", "USA", "Doctor", 120000), - rows("David", 40, "Washington", "USA", "Unemployed", 0)); + rows("John", 25, "Ontario", "Canada", "Doctor", 120000)); // The shorthand and the explicit qualified-equality form are output-identical. JSONObject explicitForm = executeQuery( String.format( "source=%s | inner join on %s.name = %s.name %s | fields name, age, state, country," - + " occupation, salary", + + " occupation, salary | sort name, occupation", TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION, @@ -973,13 +975,14 @@ public void testJoinNoPrefixImplicitField() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | join on name %s | fields name, age, state, occupation, salary", + "source=%s | join on name %s | fields name, age, state, occupation, salary | sort" + + " name, occupation", TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION)); JSONObject explicitForm = executeQuery( String.format( "source=%s | join on %s.name = %s.name %s | fields name, age, state, occupation," - + " salary", + + " salary | sort name, occupation", TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION, @@ -1017,8 +1020,8 @@ public void testLeftJoinWithImplicitField() throws IOException { @Test public void testAliasedSelfJoinWithImplicitField() throws IOException { - // Self-join on the unique `name`: a real equi-join matches each of the 8 rows to itself (8 - // rows), not the 64-row cross product a tautology would give. + // Self-join on the unique `name`: a real equi-join yields 8 rows, not the 64-row cross product + // a tautology would give. JSONObject actual = executeQuery( String.format( diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLJoinTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLJoinTest.java index eea4bd28cb1..415acc5558b 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLJoinTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLJoinTest.java @@ -1049,6 +1049,38 @@ public void testJoinWithCriteriaMaxGreaterThanZero() { verifyPPLToSparkSQL(root, expectedSparkSql); } + @Test + public void testJoinWithImplicitFieldMaxGreaterThanZero() { + // Bare-field shorthand combined with a max=N dedup wrapper. Keep-both renames the + // unaliased right-side key to `DEPT.DEPTNO`, and the dedup subquery wraps the right input. + String ppl = "source=EMP | inner join max=1 on DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], DEPT.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[=($7, $8)], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalProject(DEPTNO=[$0], DNAME=[$1], LOC=[$2])\n" + + " LogicalFilter(condition=[<=($3, 1)])\n" + + " LogicalProject(DEPTNO=[$0], DNAME=[$1], LOC=[$2]," + + " _row_number_dedup_=[ROW_NUMBER() OVER (PARTITION BY $0)])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 14); + + String expectedSparkSql = + "SELECT `EMP`.`EMPNO`, `EMP`.`ENAME`, `EMP`.`JOB`, `EMP`.`MGR`, `EMP`.`HIREDATE`," + + " `EMP`.`SAL`, `EMP`.`COMM`, `EMP`.`DEPTNO`, `t1`.`DEPTNO` `DEPT.DEPTNO`," + + " `t1`.`DNAME`, `t1`.`LOC`\n" + + "FROM `scott`.`EMP`\n" + + "INNER JOIN (SELECT `DEPTNO`, `DNAME`, `LOC`\n" + + "FROM (SELECT `DEPTNO`, `DNAME`, `LOC`, ROW_NUMBER() OVER (PARTITION BY `DEPTNO`)" + + " `_row_number_dedup_`\n" + + "FROM `scott`.`DEPT`) `t`\n" + + "WHERE `_row_number_dedup_` <= 1) `t1` ON `EMP`.`DEPTNO` = `t1`.`DEPTNO`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + @Test public void testJoinWithMaxEqualsZero() { String ppl = "source=EMP | join type=outer max=0 DEPTNO DEPT"; @@ -1274,6 +1306,28 @@ public void testSemiJoinWithImplicitField() { verifyPPLToSparkSQL(root, expectedSparkSql); } + @Test + public void testAntiJoinWithImplicitField() { + // Anti shares semi's early-return left-only branch (no keep-both project wrapper). Every one + // of the 14 EMP rows matches a DEPT row, so the anti join returns none. + String ppl = "source=EMP | anti join on DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalJoin(condition=[=($7, $8)], joinType=[anti])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + verifyResultCount(root, 0); + + String expectedSparkSql = + "SELECT *\n" + + "FROM `scott`.`EMP`\n" + + "WHERE NOT EXISTS (SELECT 1\n" + + "FROM `scott`.`DEPT`\n" + + "WHERE `EMP`.`DEPTNO` = `DEPT`.`DEPTNO`)"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + @Test public void testJoinWithImplicitFieldMultiField() { String ppl = "source=EMP | inner join on DEPTNO AND SAL EMP"; @@ -1289,8 +1343,8 @@ public void testJoinWithImplicitFieldMultiField() { verifyLogical(root, expectedLogical); } - // Self-join joins the two scans by position, so the key is a real equi-join across them, not the - // EMP.DEPTNO=EMP.DEPTNO tautology a qualifier rewrite would collapse to. + // Self-join resolves by position, so the key is a real equi-join across the two scans, not an + // EMP.DEPTNO=EMP.DEPTNO tautology. @Test public void testJoinWithImplicitFieldSelfJoin() { String ppl = "source=EMP | inner join on DEPTNO EMP"; @@ -1306,13 +1360,15 @@ public void testJoinWithImplicitFieldSelfJoin() { verifyLogical(root, expectedLogical); } - // ENAME is not in DEPT, so the right-side lookup fails. Match the field name only; Calcite's - // full message is version-dependent. + // ENAME is not in DEPT, so the bare field resolves positionally against the right input and + // Calcite throws `field [ENAME] not found` (an IllegalArgumentException, not a semantic check). + // We assert that exact phrase; it is Calcite's own RelBuilder.field wording, so a Calcite + // upgrade that rewords the message would require updating this substring. @Test public void testJoinWithImplicitFieldNotOnBothSides() { String ppl = "source=EMP | inner join on ENAME DEPT"; - Throwable t = Assert.assertThrows(RuntimeException.class, () -> getRelNode(ppl)); - verifyErrorMessageContains(t, "ENAME"); + Throwable t = Assert.assertThrows(IllegalArgumentException.class, () -> getRelNode(ppl)); + verifyErrorMessageContains(t, "field [ENAME] not found"); } @Test @@ -1359,8 +1415,8 @@ public void testJoinNoPrefixImplicitFieldWhereKeyword() { verifyPPLToSparkSQL(root, expectedSparkSql); } - // Right side is a subquery, not a table: the bare field still resolves (no underlying table name - // is needed), and the right key takes the subquery alias. + // Right side is an aliased subquery: positional resolution works, and the right key surfaces + // under the subquery alias. @Test public void testJoinWithImplicitFieldAliasedSubquery() { String ppl = "source=EMP | inner join on DEPTNO [ source=DEPT ] as d"; @@ -1375,7 +1431,7 @@ public void testJoinWithImplicitFieldAliasedSubquery() { verifyResultCount(root, 14); } - // Unaliased subquery: resolves by position with no alias and no reachable table name. + // Unaliased subquery: positional resolution needs no alias, so this still works. @Test public void testJoinWithImplicitFieldUnaliasedSubquery() { String ppl = "source=EMP | inner join on DEPTNO [ source=DEPT ]"; @@ -1404,4 +1460,34 @@ public void testJoinWithImplicitFieldAliasedSelfJoin() { + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); } + + @Test + public void testJoinMixedConditionFallsThrough() { + // A comparison in the AND-chain isn't a bare field, so the whole condition is analyzed + // normally. + String ppl = "source=EMP | join left=l right=r on SAL > 1000 AND l.DEPTNO = r.DEPTNO DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], r.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[AND(>($5, 1000), =($7, $8))], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + } + + @Test + public void testJoinMixedConditionWithInequality() { + // Qualified equality plus an inequality: no bare fields, so it's analyzed as a normal + // condition. + String ppl = "source=EMP | join left=l right=r on l.DEPTNO = r.DEPTNO AND l.SAL > 1000 DEPT"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], r.DEPTNO=[$8], DNAME=[$9], LOC=[$10])\n" + + " LogicalJoin(condition=[AND(=($7, $8), >($5, 1000))], joinType=[inner])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalTableScan(table=[[scott, DEPT]])\n"; + verifyLogical(root, expectedLogical); + } }