Skip to content

Commit d08df37

Browse files
Hanyu-WHanyu Wei
andauthored
[Enhancement] Support bare-field join criteria join on <field> shorthand (opensearch-project#5517)
Co-authored-by: Hanyu Wei <weihanyu@amazon.com> Signed-off-by: Hanyu Wei <weihanyu@amazon.com>
1 parent 7a38664 commit d08df37

10 files changed

Lines changed: 669 additions & 8 deletions

File tree

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1905,10 +1905,23 @@ public RelNode visitJoin(Join node, CalcitePlanContext context) {
19051905
}
19061906
} else {
19071907
// The join-with-criteria grammar doesn't allow empty join condition
1908-
RexNode joinCondition =
1909-
node.getJoinCondition()
1910-
.map(c -> rexVisitor.analyzeJoinCondition(c, context))
1911-
.orElse(context.relBuilder.literal(true));
1908+
RexNode joinCondition;
1909+
Optional<List<String>> bareFields =
1910+
JoinAndLookupUtils.collectBareFields(node.getJoinCondition().get());
1911+
if (bareFields.isPresent()) {
1912+
// Bare-field shorthand `on a [AND b ...]`. Resolving by stack position rather than by
1913+
// qualifier keeps a self-join `join on f t` a real cross-scan equi-join, not f = f.
1914+
joinCondition =
1915+
bareFields.get().stream()
1916+
.map(f -> buildJoinConditionByFieldName(context, f))
1917+
.reduce(context.rexBuilder::and)
1918+
.orElse(context.relBuilder.literal(true));
1919+
} else {
1920+
joinCondition =
1921+
node.getJoinCondition()
1922+
.map(c -> rexVisitor.analyzeJoinCondition(c, context))
1923+
.orElse(context.relBuilder.literal(true));
1924+
}
19121925
if (node.getJoinType() == SEMI || node.getJoinType() == ANTI) {
19131926
// semi and anti join only return left table outputs
19141927
context.relBuilder.join(

core/src/main/java/org/opensearch/sql/calcite/utils/JoinAndLookupUtils.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,15 @@
99
import java.util.HashSet;
1010
import java.util.List;
1111
import java.util.Map;
12+
import java.util.Optional;
1213
import java.util.stream.Collectors;
1314
import org.apache.calcite.rel.core.JoinRelType;
1415
import org.apache.calcite.rex.RexNode;
1516
import org.apache.calcite.util.Pair;
17+
import org.opensearch.sql.ast.expression.And;
18+
import org.opensearch.sql.ast.expression.Field;
19+
import org.opensearch.sql.ast.expression.QualifiedName;
20+
import org.opensearch.sql.ast.expression.UnresolvedExpression;
1621
import org.opensearch.sql.ast.tree.Join;
1722
import org.opensearch.sql.ast.tree.Lookup;
1823
import org.opensearch.sql.calcite.CalcitePlanContext;
@@ -37,6 +42,32 @@ static JoinRelType translateJoinType(Join.JoinType joinType) {
3742
}
3843
}
3944

45+
/**
46+
* Collects the names of bare single-part-field join criteria (e.g. {@code on a AND b} -> {@code
47+
* Optional.of(["a","b"])}). Returns empty for anything else (qualified field, comparison, OR).
48+
* The list is only constructed when every node in the AND-tree is a bare field.
49+
*/
50+
static Optional<List<String>> collectBareFields(UnresolvedExpression expr) {
51+
if (expr instanceof And and) {
52+
return collectBareFields(and.getLeft())
53+
.flatMap(
54+
left ->
55+
collectBareFields(and.getRight())
56+
.map(
57+
right -> {
58+
List<String> merged = new ArrayList<>(left);
59+
merged.addAll(right);
60+
return merged;
61+
}));
62+
}
63+
if (expr instanceof Field field
64+
&& field.getField() instanceof QualifiedName qn
65+
&& qn.getPrefix().isEmpty()) {
66+
return Optional.of(new ArrayList<>(List.of(qn.getSuffix())));
67+
}
68+
return Optional.empty();
69+
}
70+
4071
/* ------For Lookup------ */
4172

4273
/**
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.calcite.utils;
7+
8+
import static org.junit.jupiter.api.Assertions.assertEquals;
9+
import static org.junit.jupiter.api.Assertions.assertTrue;
10+
11+
import java.util.List;
12+
import java.util.Optional;
13+
import org.junit.jupiter.api.Test;
14+
import org.opensearch.sql.ast.expression.And;
15+
import org.opensearch.sql.ast.expression.Compare;
16+
import org.opensearch.sql.ast.expression.Field;
17+
import org.opensearch.sql.ast.expression.QualifiedName;
18+
import org.opensearch.sql.ast.expression.UnresolvedExpression;
19+
20+
public class JoinAndLookupUtilsTest {
21+
22+
@Test
23+
public void collectBareFields_singleField() {
24+
UnresolvedExpression expr = bareField("a");
25+
Optional<List<String>> result = JoinAndLookupUtils.collectBareFields(expr);
26+
assertTrue(result.isPresent());
27+
assertEquals(List.of("a"), result.get());
28+
}
29+
30+
@Test
31+
public void collectBareFields_multipleFieldsAndChain() {
32+
UnresolvedExpression expr = new And(bareField("a"), new And(bareField("b"), bareField("c")));
33+
Optional<List<String>> result = JoinAndLookupUtils.collectBareFields(expr);
34+
assertTrue(result.isPresent());
35+
assertEquals(List.of("a", "b", "c"), result.get());
36+
}
37+
38+
@Test
39+
public void collectBareFields_mixedConditionReturnsEmpty() {
40+
// `a AND l.x = r.x` — the Compare node makes this not all-bare-fields
41+
UnresolvedExpression compare =
42+
new Compare("=", qualifiedField("l", "x"), qualifiedField("r", "x"));
43+
UnresolvedExpression expr = new And(bareField("a"), compare);
44+
Optional<List<String>> result = JoinAndLookupUtils.collectBareFields(expr);
45+
assertTrue(result.isEmpty());
46+
}
47+
48+
@Test
49+
public void collectBareFields_qualifiedFieldReturnsEmpty() {
50+
// A two-part name like `t.x` is not a bare field
51+
UnresolvedExpression expr = qualifiedField("t", "x");
52+
Optional<List<String>> result = JoinAndLookupUtils.collectBareFields(expr);
53+
assertTrue(result.isEmpty());
54+
}
55+
56+
@Test
57+
public void collectBareFields_mixedLeftSideReturnsEmptyNoPartialState() {
58+
// Compare on left, bare field on right — must return empty (not ["b"])
59+
UnresolvedExpression compare = new Compare("=", bareField("x"), bareField("y"));
60+
UnresolvedExpression expr = new And(compare, bareField("b"));
61+
Optional<List<String>> result = JoinAndLookupUtils.collectBareFields(expr);
62+
assertTrue(result.isEmpty());
63+
}
64+
65+
private static Field bareField(String name) {
66+
return new Field(QualifiedName.of(name));
67+
}
68+
69+
private static Field qualifiedField(String qualifier, String name) {
70+
return new Field(QualifiedName.of(qualifier, name));
71+
}
72+
}

docs/user/ppl/cmd/join.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ source = table1 | left anti join left = l right = r on l.a = r.a table2
2929
source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ]
3030
source = table1 | inner join on table1.a = table2.a table2 | fields table1.a, table2.a, table1.b, table1.c
3131
source = table1 | inner join on a = c table2 | fields a, b, c, d
32+
source = table1 | inner join on a table2 | fields a, table2.a, b, c
33+
source = table1 | inner join on a AND b table2 | fields a, table2.a, b, table2.b
34+
source = table1 | join on a table2 | fields a, table2.a, b, c
3235
source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields l.a, r.a
3336
source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields t1.a, t2.a
3437
source = table1 | join left = l right = r on l.a = r.a [ source = table2 ] as s | fields l.a, s.a
@@ -40,7 +43,7 @@ The basic `join` syntax supports the following parameters.
4043

4144
| Parameter | Required/Optional | Description |
4245
| --- | --- | --- |
43-
| `<joinCriteria>` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. |
46+
| `<joinCriteria>` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. A bare field name `f` (or `f AND g ...`) is shorthand for an equi-join on the common field(s) and keeps both key columns. |
4447
| `<right-dataset>` | Required | The right dataset, which can be an index or a subsearch, with or without an alias. |
4548
| `joinType` | Optional | The type of join to perform. Valid values are `left`, `semi`, `anti`, and performance-sensitive types (`right`, `full`, and `cross`). Default is `inner`. |
4649
| `left` | Optional | An alias for the left dataset (typically a subsearch) used to avoid ambiguous field names. Specify as `left = <leftAlias>`. |
@@ -71,7 +74,7 @@ The extended `join` syntax supports the following parameters.
7174

7275
| Parameter | Required/Optional | Description |
7376
| --- | --- | --- |
74-
| `<joinCriteria>` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. |
77+
| `<joinCriteria>` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. A bare field name `f` (or `f AND g ...`) is shorthand for an equi-join on the common field(s); it keeps both key columns and differs from `<join-field-list>`, which merges duplicates. |
7578
| `<right-dataset>` | Required | The right dataset, which can be an index or a subsearch, with or without an alias. |
7679
| `type` | Optional | The join type when using extended syntax. Valid values are `left`, `outer` (same as `left`), `semi`, `anti`, and performance-sensitive types (`right`, `full`, and `cross`). Default is `inner`. |
7780
| `<join-field-list>` | Optional | A list of fields used to build the join criteria. These fields must exist in both datasets. If not specified, all fields common to both datasets are used as join keys. |

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -928,6 +928,109 @@ public void testJoinWhenLegacyNotPreferred() throws IOException {
928928
});
929929
}
930930

931+
@Test
932+
public void testJoinWithImplicitField() throws IOException {
933+
// Keep-both, so co-named columns (other than the key) resolve to the left side.
934+
JSONObject actual =
935+
executeQuery(
936+
String.format(
937+
"source=%s | inner join on name %s | fields name, age, state, country, occupation,"
938+
+ " salary | sort name, occupation",
939+
TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
940+
verifySchema(
941+
actual,
942+
schema("name", "string"),
943+
schema("age", "int"),
944+
schema("state", "string"),
945+
schema("country", "string"),
946+
schema("occupation", "string"),
947+
schema("salary", "int"));
948+
// Rows are listed in the `sort name, occupation` order so verifyDataRowsInOrder also
949+
// validates the sort (David/Doctor < David/Unemployed, then Hello, Jake, Jane, John).
950+
verifyDataRowsInOrder(
951+
actual,
952+
rows("David", 40, "Washington", "USA", "Doctor", 120000),
953+
rows("David", 40, "Washington", "USA", "Unemployed", 0),
954+
rows("Hello", 30, "New York", "USA", "Artist", 70000),
955+
rows("Jake", 70, "California", "USA", "Engineer", 100000),
956+
rows("Jane", 20, "Quebec", "Canada", "Scientist", 90000),
957+
rows("John", 25, "Ontario", "Canada", "Doctor", 120000));
958+
959+
// The shorthand and the explicit qualified-equality form are output-identical.
960+
JSONObject explicitForm =
961+
executeQuery(
962+
String.format(
963+
"source=%s | inner join on %s.name = %s.name %s | fields name, age, state, country,"
964+
+ " occupation, salary | sort name, occupation",
965+
TEST_INDEX_STATE_COUNTRY,
966+
TEST_INDEX_STATE_COUNTRY,
967+
TEST_INDEX_OCCUPATION,
968+
TEST_INDEX_OCCUPATION));
969+
assertJsonEquals(explicitForm.toString(), actual.toString());
970+
}
971+
972+
@Test
973+
public void testJoinNoPrefixImplicitField() throws IOException {
974+
// No-prefix `join on name` matches the explicit qualified form, not the field-list `join name`.
975+
JSONObject actual =
976+
executeQuery(
977+
String.format(
978+
"source=%s | join on name %s | fields name, age, state, occupation, salary | sort"
979+
+ " name, occupation",
980+
TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
981+
JSONObject explicitForm =
982+
executeQuery(
983+
String.format(
984+
"source=%s | join on %s.name = %s.name %s | fields name, age, state, occupation,"
985+
+ " salary | sort name, occupation",
986+
TEST_INDEX_STATE_COUNTRY,
987+
TEST_INDEX_STATE_COUNTRY,
988+
TEST_INDEX_OCCUPATION,
989+
TEST_INDEX_OCCUPATION));
990+
assertJsonEquals(explicitForm.toString(), actual.toString());
991+
}
992+
993+
@Test
994+
public void testLeftJoinWithImplicitField() throws IOException {
995+
// Keep-both does not coalesce, so unmatched-left rows keep the non-null left key, not a NULL.
996+
JSONObject actual =
997+
executeQuery(
998+
String.format(
999+
"source=%s | left join on name %s | fields name, age, state, occupation, salary",
1000+
TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
1001+
verifySchema(
1002+
actual,
1003+
schema("name", "string"),
1004+
schema("age", "int"),
1005+
schema("state", "string"),
1006+
schema("occupation", "string"),
1007+
schema("salary", "int"));
1008+
verifyDataRows(
1009+
actual,
1010+
rows("Jake", 70, "California", "Engineer", 100000),
1011+
rows("Hello", 30, "New York", "Artist", 70000),
1012+
rows("John", 25, "Ontario", "Doctor", 120000),
1013+
rows("Jane", 20, "Quebec", "Scientist", 90000),
1014+
rows("David", 40, "Washington", "Doctor", 120000),
1015+
rows("David", 40, "Washington", "Unemployed", 0),
1016+
rows("Jim", 27, "B.C", null, null),
1017+
rows("Peter", 57, "B.C", null, null),
1018+
rows("Rick", 70, "B.C", null, null));
1019+
}
1020+
1021+
@Test
1022+
public void testAliasedSelfJoinWithImplicitField() throws IOException {
1023+
// Self-join on the unique `name`: a real equi-join yields 8 rows, not the 64-row cross product
1024+
// a tautology would give.
1025+
JSONObject actual =
1026+
executeQuery(
1027+
String.format(
1028+
"source=%s | inner join left=l right=r on name %s | fields name, r.name",
1029+
TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY));
1030+
verifyNumOfRows(actual, 8);
1031+
verifySchema(actual, schema("name", "string"), schema("r.name", "string"));
1032+
}
1033+
9311034
@Test
9321035
public void testJoinComparing() throws IOException {
9331036
JSONObject actual =

ppl/src/main/antlr/OpenSearchPPLParser.g4

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -729,8 +729,9 @@ sourceFilterArg
729729

730730
// join
731731
joinCommand
732-
: JOIN (joinOption)* (fieldList)? right = tableOrSubqueryClause
733-
| sqlLikeJoinType? JOIN (joinOption)* sideAlias joinHintList? joinCriteria right = tableOrSubqueryClause
732+
// Criteria alt listed first - so `join on a` reads `on` as the criteria keyword, not a field.
733+
: sqlLikeJoinType? JOIN (joinOption)* sideAlias joinHintList? joinCriteria right = tableOrSubqueryClause
734+
| JOIN (joinOption)* (fieldList)? right = tableOrSubqueryClause
734735
;
735736

736737
sqlLikeJoinType

ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,8 @@ public UnresolvedPlan visitJoinCommand(OpenSearchPPLParser.JoinCommandContext ct
326326
if (ctx.fieldList() != null) {
327327
joinFields = Optional.of(getFieldList(ctx.fieldList()));
328328
}
329+
// Keep a bare `on <field>` criteria verbatim; the planner expands it to an equi-join. Folding
330+
// it into joinFields here would instead merge the key into one column.
329331
return new Join(
330332
projectExceptMeta(right),
331333
leftAlias,

0 commit comments

Comments
 (0)