|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | +package org.apache.sedona.sql |
| 20 | + |
| 21 | +import org.apache.spark.sql.DataFrame |
| 22 | +import org.apache.spark.sql.functions.{broadcast, expr} |
| 23 | +import org.apache.spark.sql.sedona_sql.strategy.join.{BroadcastIndexJoinExec, RangeJoinExec} |
| 24 | + |
| 25 | +class Box2DJoinSuite extends TestBaseScala { |
| 26 | + |
| 27 | + import Box2DJoinSuite.TestBox |
| 28 | + |
| 29 | + /** |
| 30 | + * Three left-side boxes and three right-side boxes wired so we can predict exact result sizes: |
| 31 | + * |
| 32 | + * - L1=(0,0,10,10) R1=(5,5,15,15) — overlapping |
| 33 | + * - L1=(0,0,10,10) R2=(2,2,8,8) — R2 fully inside L1 |
| 34 | + * - L2=(0,0,10,10) R1=(5,5,15,15) — overlapping |
| 35 | + * - L2=(0,0,10,10) R2=(2,2,8,8) — R2 fully inside L2 |
| 36 | + * - L3 and R3 are disjoint from everything else; (L3,R3) is itself disjoint. |
| 37 | + * |
| 38 | + * Intersection-pair count: 4. Containment-pair count: 2 (L1⊇R2, L2⊇R2). |
| 39 | + */ |
| 40 | + private def leftBoxes: DataFrame = { |
| 41 | + import sparkSession.implicits._ |
| 42 | + Seq(TestBox(1, 0, 0, 10, 10), TestBox(2, 0, 0, 10, 10), TestBox(3, 20, 20, 30, 30)) |
| 43 | + .toDF("id", "xmin", "ymin", "xmax", "ymax") |
| 44 | + .selectExpr("id", "ST_MakeBox2D(ST_Point(xmin, ymin), ST_Point(xmax, ymax)) AS box") |
| 45 | + } |
| 46 | + |
| 47 | + private def rightBoxes: DataFrame = { |
| 48 | + import sparkSession.implicits._ |
| 49 | + Seq(TestBox(11, 5, 5, 15, 15), TestBox(12, 2, 2, 8, 8), TestBox(13, 40, 40, 50, 50)) |
| 50 | + .toDF("id", "xmin", "ymin", "xmax", "ymax") |
| 51 | + .selectExpr("id", "ST_MakeBox2D(ST_Point(xmin, ymin), ST_Point(xmax, ymax)) AS box") |
| 52 | + } |
| 53 | + |
| 54 | + describe("Box2D spatial join") { |
| 55 | + |
| 56 | + it("ST_BoxIntersects: broadcast index join produces correct pairs") { |
| 57 | + val df = leftBoxes |
| 58 | + .alias("L") |
| 59 | + .join(broadcast(rightBoxes.alias("R")), expr("ST_BoxIntersects(L.box, R.box)")) |
| 60 | + val plan = df.queryExecution.sparkPlan |
| 61 | + assert( |
| 62 | + plan.collect { case b: BroadcastIndexJoinExec => b }.size == 1, |
| 63 | + "Expected BroadcastIndexJoinExec in the plan") |
| 64 | + assert(df.count() == 4) |
| 65 | + } |
| 66 | + |
| 67 | + it("ST_BoxIntersects: argument order is symmetric") { |
| 68 | + val swapped = leftBoxes |
| 69 | + .alias("L") |
| 70 | + .join(broadcast(rightBoxes.alias("R")), expr("ST_BoxIntersects(R.box, L.box)")) |
| 71 | + assert(swapped.count() == 4) |
| 72 | + assert(swapped.queryExecution.sparkPlan.collect { case b: BroadcastIndexJoinExec => |
| 73 | + b |
| 74 | + }.size == 1) |
| 75 | + } |
| 76 | + |
| 77 | + it("ST_BoxContains: broadcast index join uses COVERS semantics") { |
| 78 | + val df = leftBoxes |
| 79 | + .alias("L") |
| 80 | + .join(broadcast(rightBoxes.alias("R")), expr("ST_BoxContains(L.box, R.box)")) |
| 81 | + assert(df.queryExecution.sparkPlan.collect { case b: BroadcastIndexJoinExec => |
| 82 | + b |
| 83 | + }.size == 1) |
| 84 | + assert(df.count() == 2) |
| 85 | + } |
| 86 | + |
| 87 | + it("ST_BoxContains: edge-touching boxes count (closed-interval semantics)") { |
| 88 | + // R contained in L sharing an edge: ST_BoxContains is closed-interval, so this matches. |
| 89 | + // JTS Polygon.contains would reject (strict-interior), JTS Polygon.covers accepts; the |
| 90 | + // detector maps ST_BoxContains → SpatialPredicate.COVERS specifically for this case. |
| 91 | + import sparkSession.implicits._ |
| 92 | + val outer = Seq(TestBox(1, 0, 0, 10, 10)) |
| 93 | + .toDF("id", "xmin", "ymin", "xmax", "ymax") |
| 94 | + .selectExpr("id", "ST_MakeBox2D(ST_Point(xmin, ymin), ST_Point(xmax, ymax)) AS box") |
| 95 | + // edge-sharing box: same xmax, shares the right edge with outer. |
| 96 | + val inner = Seq(TestBox(11, 5, 5, 10, 10)) |
| 97 | + .toDF("id", "xmin", "ymin", "xmax", "ymax") |
| 98 | + .selectExpr("id", "ST_MakeBox2D(ST_Point(xmin, ymin), ST_Point(xmax, ymax)) AS box") |
| 99 | + val df = outer |
| 100 | + .alias("O") |
| 101 | + .join(broadcast(inner.alias("I")), expr("ST_BoxContains(O.box, I.box)")) |
| 102 | + assert(df.count() == 1, "Closed-interval containment must include edge-touching boxes") |
| 103 | + } |
| 104 | + |
| 105 | + it("ST_BoxIntersects: non-broadcast range join produces the same count") { |
| 106 | + val df = leftBoxes |
| 107 | + .alias("L") |
| 108 | + .join(rightBoxes.alias("R"), expr("ST_BoxIntersects(L.box, R.box)")) |
| 109 | + assert( |
| 110 | + df.queryExecution.sparkPlan.collect { case r: RangeJoinExec => r }.size == 1, |
| 111 | + "Expected RangeJoinExec in the plan") |
| 112 | + assert(df.count() == 4) |
| 113 | + } |
| 114 | + |
| 115 | + it("Null Box2D rows are safe and produce no matches") { |
| 116 | + // A null shape on either side must not crash the executor and must not contribute matches |
| 117 | + // (mirrors the existing GeometrySerializer.deserialize(null) → empty-collection fallback). |
| 118 | + import sparkSession.implicits._ |
| 119 | + val withNullLeft = leftBoxes |
| 120 | + .selectExpr("id", "box AS box") |
| 121 | + .union(Seq((99, null.asInstanceOf[org.apache.sedona.common.geometryObjects.Box2D])) |
| 122 | + .toDF("id", "box")) |
| 123 | + val df = withNullLeft |
| 124 | + .alias("L") |
| 125 | + .join(broadcast(rightBoxes.alias("R")), expr("ST_BoxIntersects(L.box, R.box)")) |
| 126 | + assert(df.count() == 4) // unchanged from the non-null fixture |
| 127 | + // Range join path (no broadcast) also tolerates nulls. |
| 128 | + val rangeDf = withNullLeft |
| 129 | + .alias("L") |
| 130 | + .join(rightBoxes.alias("R"), expr("ST_BoxIntersects(L.box, R.box)")) |
| 131 | + assert(rangeDf.count() == 4) |
| 132 | + } |
| 133 | + |
| 134 | + it("Inverted Box2D bounds in a join throw IllegalArgumentException") { |
| 135 | + import sparkSession.implicits._ |
| 136 | + // Construct an inverted Box2D directly via the Java constructor (the SQL ST_MakeBox2D |
| 137 | + // doesn't validate, so this is how a stored column with inverted bounds would look). |
| 138 | + val invertedLeft = |
| 139 | + Seq((1, new org.apache.sedona.common.geometryObjects.Box2D(10.0, 0.0, 0.0, 10.0))) |
| 140 | + .toDF("id", "box") |
| 141 | + val df = invertedLeft |
| 142 | + .alias("L") |
| 143 | + .join(broadcast(rightBoxes.alias("R")), expr("ST_BoxIntersects(L.box, R.box)")) |
| 144 | + // Confirm the join is actually planned as BroadcastIndexJoinExec so the throw originates |
| 145 | + // from the join-side `shapeToGeometry` validation, not from a row-by-row fallback that |
| 146 | + // also happens to throw via `Predicates.boxIntersects`. |
| 147 | + assert( |
| 148 | + df.queryExecution.sparkPlan.collect { case b: BroadcastIndexJoinExec => b }.size == 1, |
| 149 | + "Expected BroadcastIndexJoinExec — without it the test could pass via row-by-row " + |
| 150 | + "predicate evaluation, hiding a regression in join optimization") |
| 151 | + val ex = intercept[org.apache.spark.SparkException](df.collect()) |
| 152 | + val cause = Iterator |
| 153 | + .iterate(ex: Throwable)(_.getCause) |
| 154 | + .takeWhile(_ != null) |
| 155 | + .find(_.isInstanceOf[IllegalArgumentException]) |
| 156 | + assert(cause.isDefined, s"Expected IllegalArgumentException in cause chain, got: $ex") |
| 157 | + assert(cause.get.getMessage.contains("inverted bounds")) |
| 158 | + } |
| 159 | + |
| 160 | + it("Result is equivalent to ST_Intersects on the Box2D-as-polygon envelopes") { |
| 161 | + val viaBox = leftBoxes |
| 162 | + .alias("L") |
| 163 | + .join(broadcast(rightBoxes.alias("R")), expr("ST_BoxIntersects(L.box, R.box)")) |
| 164 | + .selectExpr("L.id AS l", "R.id AS r") |
| 165 | + .orderBy("l", "r") |
| 166 | + .collect() |
| 167 | + .toSeq |
| 168 | + |
| 169 | + // ST_GeomFromBox2D is the function-form equivalent of `CAST(box AS geometry)`. The cast |
| 170 | + // syntax requires the Sedona SQL parser extension; this suite runs under the common test |
| 171 | + // base, which doesn't wire that extension, so we go through the function form here. |
| 172 | + val asPolygons = leftBoxes |
| 173 | + .selectExpr("id", "ST_GeomFromBox2D(box) AS g") |
| 174 | + .alias("L") |
| 175 | + .join( |
| 176 | + broadcast(rightBoxes.selectExpr("id", "ST_GeomFromBox2D(box) AS g").alias("R")), |
| 177 | + expr("ST_Intersects(L.g, R.g)")) |
| 178 | + .selectExpr("L.id AS l", "R.id AS r") |
| 179 | + .orderBy("l", "r") |
| 180 | + .collect() |
| 181 | + .toSeq |
| 182 | + |
| 183 | + assert(viaBox == asPolygons) |
| 184 | + } |
| 185 | + } |
| 186 | + |
| 187 | +} |
| 188 | + |
| 189 | +object Box2DJoinSuite { |
| 190 | + // Top-level case class so Spark's encoder doesn't need an outer-class reference. |
| 191 | + case class TestBox(id: Int, xmin: Double, ymin: Double, xmax: Double, ymax: Double) |
| 192 | +} |
0 commit comments