Skip to content

Commit 62b7406

Browse files
authored
[GH-2830] Add broadcast spatial-join support for the Geography type with ST_Contains (#2864)
1 parent df3f740 commit 62b7406

8 files changed

Lines changed: 497 additions & 99 deletions

File tree

common/src/main/java/org/apache/sedona/common/S2Geography/WkbS2Shape.java

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,34 +105,48 @@ public WkbS2Shape(byte[] wkb) {
105105
this.chainLengths = new int[numRings];
106106
this.vertexOffsets = new int[numRings];
107107

108-
// First pass: count total vertices and compute offsets
108+
// First pass: count total vertices and compute offsets. Sedona's WKBWriter writes
109+
// open rings (n unique vertices, no closing duplicate); standard WKB writes closed
110+
// rings (n+1 coords with last == first). Detect the closing-duplicate case by
111+
// comparing the first and last (lon, lat) pair so we get the right edge count
112+
// either way: edges = uniqueVertices = closed ? ringCoords - 1 : ringCoords.
109113
int totalVerts = 0;
110114
int edgeCount = 0;
111115
int byteOffset = payloadOffset + 4;
112116
int[] ringCoordCounts = new int[numRings];
113117
int[] ringByteOffsets = new int[numRings];
118+
boolean[] ringClosed = new boolean[numRings];
114119
for (int r = 0; r < numRings; r++) {
115120
int ringCoords = buf.getInt(byteOffset);
116121
ringCoordCounts[r] = ringCoords;
117122
ringByteOffsets[r] = byteOffset + 4;
123+
boolean closed =
124+
ringCoords >= 2 && firstAndLastEqual(buf, ringByteOffsets[r], ringCoords);
125+
ringClosed[r] = closed;
118126
byteOffset += 4 + ringCoords * 16;
119127

120-
int ringEdges = Math.max(0, ringCoords - 1);
128+
int ringEdges = closed ? Math.max(0, ringCoords - 1) : ringCoords;
129+
int storedVerts = closed ? ringCoords : ringCoords;
121130
chainStarts[r] = edgeCount;
122131
chainLengths[r] = ringEdges;
123132
vertexOffsets[r] = totalVerts;
124133
edgeCount += ringEdges;
125-
totalVerts += ringCoords;
134+
totalVerts += storedVerts + (closed ? 0 : 1); // append closing duplicate for open rings
126135
}
127136
this.totalEdges = edgeCount;
128137

129-
// Second pass: read all vertices at once
138+
// Second pass: read all vertices, appending a closing duplicate for open rings so
139+
// the rest of the shape interface (getEdge, getChainEdge, computeContainsOrigin)
140+
// can index `vertexOffsets[r] + (i % chainLengths[r])` uniformly.
130141
this.vertices = new S2Point[totalVerts];
131142
int vi = 0;
132143
for (int r = 0; r < numRings; r++) {
133144
S2Point[] ringVerts = readVertices(buf, ringByteOffsets[r], ringCoordCounts[r]);
134145
System.arraycopy(ringVerts, 0, vertices, vi, ringVerts.length);
135146
vi += ringVerts.length;
147+
if (!ringClosed[r] && ringVerts.length > 0) {
148+
vertices[vi++] = ringVerts[0];
149+
}
136150
}
137151

138152
// Eagerly compute containsOrigin from first ring
@@ -229,6 +243,18 @@ private int findChain(int edgeId) {
229243
return 0;
230244
}
231245

246+
/**
247+
* Returns true when the ring's first and last vertex compare equal as raw doubles, i.e. the ring
248+
* is closed in the standard WKB sense. Sedona's own WKBWriter produces open rings, so this cheap
249+
* numeric comparison on the in-buffer bytes lets us distinguish the two cases without running
250+
* through the S2Point conversion.
251+
*/
252+
private static boolean firstAndLastEqual(ByteBuffer buf, int byteOffset, int numCoords) {
253+
int lastOffset = byteOffset + (numCoords - 1) * 16;
254+
return buf.getDouble(byteOffset) == buf.getDouble(lastOffset)
255+
&& buf.getDouble(byteOffset + 8) == buf.getDouble(lastOffset + 8);
256+
}
257+
232258
/** Read numCoords (lon, lat) doubles from WKB and convert to S2Points. */
233259
private static S2Point[] readVertices(ByteBuffer buf, int byteOffset, int numCoords) {
234260
S2Point[] pts = new S2Point[numCoords];
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.sedona.common.S2Geography;
20+
21+
import static org.junit.Assert.assertFalse;
22+
import static org.junit.Assert.assertTrue;
23+
24+
import org.apache.sedona.common.geography.Constructors;
25+
import org.apache.sedona.common.geography.Functions;
26+
import org.junit.Test;
27+
28+
/**
29+
* Localises a Geography ST_Contains correctness bug seen when arguments come from a DataFrame (i.e.
30+
* after a GeographyWKBSerializer round-trip). The WKBGeography fast path (getShapeIndexGeography →
31+
* WkbS2Shape) returns the wrong answer for some polygon-point pairs; the slow path through
32+
* S2Polygon (and direct ST_GeogFromWKT in a SELECT) is correct.
33+
*/
34+
public class WkbContainsRoundtripTest {
35+
36+
/**
37+
* Direct Functions.contains call, no round-trip. Should return false: (10, 10) is far outside the
38+
* small polygon at (2..3, 2..3).
39+
*/
40+
@Test
41+
public void containsIsFalseWithoutRoundTrip() throws Exception {
42+
Geography poly = Constructors.geogFromWKT("POLYGON((2 2, 3 2, 3 3, 2 3, 2 2))", 4326);
43+
Geography pt = Constructors.geogFromWKT("POINT(10 10)", 4326);
44+
assertFalse(Functions.contains(poly, pt));
45+
assertTrue(Functions.contains(poly, Constructors.geogFromWKT("POINT(2.5 2.5)", 4326)));
46+
}
47+
48+
/** Control test mirroring GeographyFunctionTest's "ST_Contains point outside polygon". */
49+
@Test
50+
public void controlPolygonAtOrigin() throws Exception {
51+
Geography poly = Constructors.geogFromWKT("POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))", 4326);
52+
Geography ptOutside = Constructors.geogFromWKT("POINT(2 2)", 4326);
53+
Geography ptInside = Constructors.geogFromWKT("POINT(0.5 0.5)", 4326);
54+
assertFalse("polygon at origin must NOT contain (2, 2)", Functions.contains(poly, ptOutside));
55+
assertTrue("polygon at origin must contain (0.5, 0.5)", Functions.contains(poly, ptInside));
56+
}
57+
58+
/**
59+
* Bypass WkbS2Shape and feed the polygon through PolygonGeography directly. If this passes while
60+
* the equivalent WKBGeography case fails, the bug is localised to WkbS2Shape (or to the
61+
* `result.shapeIndex.add(new WkbS2Shape(...))` path in WKBGeography.getShapeIndexGeography).
62+
*/
63+
@Test
64+
public void bypassWkbS2ShapeViaPolygonGeography() throws Exception {
65+
// Force the slow path: parse via WKTReader then DON'T wrap in WKBGeography.
66+
Geography poly = new WKTReader().read("POLYGON((2 2, 3 2, 3 3, 2 3, 2 2))");
67+
poly.setSRID(4326);
68+
Geography ptOutside = new WKTReader().read("POINT(10 10)");
69+
ptOutside.setSRID(4326);
70+
Geography ptInside = new WKTReader().read("POINT(2.5 2.5)");
71+
ptInside.setSRID(4326);
72+
assertFalse(
73+
"[slow path] polygon at (2..3,2..3) must NOT contain (10, 10)",
74+
Functions.contains(poly, ptOutside));
75+
assertTrue(
76+
"[slow path] polygon at (2..3,2..3) must contain (2.5, 2.5)",
77+
Functions.contains(poly, ptInside));
78+
}
79+
80+
/**
81+
* Same logical inputs, but each Geography goes through the WKB serializer round-trip first —
82+
* which is what happens whenever a GeographyUDT column is read back from a DataFrame.
83+
*/
84+
@Test
85+
public void containsIsFalseAfterWkbRoundTrip() throws Exception {
86+
Geography poly =
87+
GeographyWKBSerializer.deserialize(
88+
GeographyWKBSerializer.serialize(
89+
Constructors.geogFromWKT("POLYGON((2 2, 3 2, 3 3, 2 3, 2 2))", 4326)));
90+
Geography ptOutside =
91+
GeographyWKBSerializer.deserialize(
92+
GeographyWKBSerializer.serialize(Constructors.geogFromWKT("POINT(10 10)", 4326)));
93+
Geography ptInside =
94+
GeographyWKBSerializer.deserialize(
95+
GeographyWKBSerializer.serialize(Constructors.geogFromWKT("POINT(2.5 2.5)", 4326)));
96+
assertFalse(
97+
"polygon at (2..3,2..3) must NOT contain (10, 10)", Functions.contains(poly, ptOutside));
98+
assertTrue(
99+
"polygon at (2..3,2..3) must contain (2.5, 2.5)", Functions.contains(poly, ptInside));
100+
}
101+
}

0 commit comments

Comments
 (0)