Skip to content

Commit af23c11

Browse files
committed
add update + merge test cases with NaN inputs
1 parent 91e9d26 commit af23c11

2 files changed

Lines changed: 145 additions & 27 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/BoundingBox.java

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,13 @@ public boolean isValid() {
9292

9393
/**
9494
* Checks if the bounding box is empty.
95-
* A bounding box is considered empty if all bounds are in their initial state
96-
* (i.e., xMin = +Infinity, xMax = -Infinity, etc.).
95+
* A bounding box is considered empty if any bounds are in their initial state
9796
*
9897
* @return true if the bounding box is empty, false otherwise.
9998
*/
10099
public boolean isEmpty() {
101-
return Double.isInfinite(xMin) && Double.isInfinite(xMax) && Double.isInfinite(yMin) && Double.isInfinite(yMax);
100+
return (Double.isInfinite(xMin) && Double.isInfinite(xMax))
101+
|| (Double.isInfinite(yMin) && Double.isInfinite(yMax));
102102
}
103103

104104
/**
@@ -107,7 +107,7 @@ public boolean isEmpty() {
107107
* @param other the other BoundingBox to merge
108108
*/
109109
public void merge(BoundingBox other) {
110-
if (other == null) {
110+
if (other == null || other.isEmpty()) {
111111
return;
112112
}
113113
this.xMin = Math.min(this.xMin, other.xMin);
@@ -146,12 +146,19 @@ public void update(Geometry geometry) {
146146

147147
/**
148148
* Updates the bounding box with the given bounds.
149+
* Only updates X bounds if both minX and maxX are not NaN.
150+
* Only updates Y bounds if both minY and maxY are not NaN.
149151
*/
150152
private void updateBounds(double minX, double maxX, double minY, double maxY) {
151-
xMin = Math.min(xMin, minX);
152-
xMax = Math.max(xMax, maxX);
153-
yMin = Math.min(yMin, minY);
154-
yMax = Math.max(yMax, maxY);
153+
if (!Double.isNaN(minX) && !Double.isNaN(maxX)) {
154+
xMin = Math.min(xMin, minX);
155+
xMax = Math.max(xMax, maxX);
156+
}
157+
158+
if (!Double.isNaN(minY) && !Double.isNaN(maxY)) {
159+
yMin = Math.min(yMin, minY);
160+
yMax = Math.max(yMax, maxY);
161+
}
155162
}
156163

157164
/**

parquet-column/src/test/java/org/apache/parquet/column/statistics/geometry/TestBoundingBox.java

Lines changed: 130 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,7 @@ public void testNaNCoordinates() {
8282
boundingBox.update(nanPoint);
8383

8484
// All values should be NaN after updating with all-NaN coordinates
85-
Assert.assertEquals("XMin should be NaN", Double.NaN, boundingBox.getXMin(), 0.0);
86-
Assert.assertEquals("XMax should be NaN", Double.NaN, boundingBox.getXMax(), 0.0);
87-
Assert.assertEquals("YMin should be NaN", Double.NaN, boundingBox.getYMin(), 0.0);
88-
Assert.assertEquals("YMax should be NaN", Double.NaN, boundingBox.getYMax(), 0.0);
85+
Assert.assertTrue(boundingBox.isEmpty());
8986

9087
// Reset the bounding box for the next test
9188
boundingBox = new BoundingBox();
@@ -95,10 +92,7 @@ public void testNaNCoordinates() {
9592
boundingBox.update(mixedPoint);
9693

9794
// The valid X coordinate should be used, Y should remain at initial values
98-
Assert.assertEquals("XMin should be 15.0", 15.0, boundingBox.getXMin(), 0.0);
99-
Assert.assertEquals("XMax should be 15.0", 15.0, boundingBox.getXMax(), 0.0);
100-
Assert.assertEquals("YMin should be NaN", Double.NaN, boundingBox.getYMin(), 0.0);
101-
Assert.assertEquals("YMax should be NaN", Double.NaN, boundingBox.getYMax(), 0.0);
95+
Assert.assertTrue(boundingBox.isEmpty());
10296
}
10397

10498
@Test
@@ -254,8 +248,8 @@ public void testUpdateWithAllNaNCoordinatesAfterValid() {
254248
Point nanPoint = gf.createPoint(new Coordinate(Double.NaN, Double.NaN));
255249
box.update(nanPoint);
256250

257-
// Check that box1 is invalid after the merge
258-
Assert.assertFalse("Box should be invalid after the merge", box.isValid());
251+
Assert.assertFalse("Box should be empty after the merge", box.isEmpty());
252+
Assert.assertTrue("Box should be valid after the merge", box.isValid());
259253
}
260254

261255
@Test
@@ -435,11 +429,8 @@ public void testLineStringWithNaNCoordinates() {
435429

436430
box3.update(gf.createLineString(coords3));
437431

438-
// Should result in NaN for all values
439-
Assert.assertEquals(Double.NaN, box3.getXMin(), 0.0);
440-
Assert.assertEquals(Double.NaN, box3.getXMax(), 0.0);
441-
Assert.assertEquals(Double.NaN, box3.getYMin(), 0.0);
442-
Assert.assertEquals(Double.NaN, box3.getYMax(), 0.0);
432+
// The bounding box should remain empty
433+
Assert.assertTrue(box3.isEmpty());
443434
}
444435

445436
@Test
@@ -466,10 +457,130 @@ public void testLineStringWithPartialNaNCoordinates() {
466457
new Coordinate[] {new Coordinate(Double.NaN, 5), new Coordinate(6, Double.NaN), new Coordinate(7, 8)};
467458

468459
box2.update(gf.createLineString(coords2));
460+
Assert.assertTrue(box2.isEmpty());
461+
}
462+
463+
/**
464+
* Tests the end-to-end case for updating and merging bounding boxes with mixed valid and NaN coordinates.
465+
*
466+
* Scenario - Parquet file with multiple row groups:
467+
* file-level bbox: [1, 8, 100, 800]
468+
*
469+
* Row group 1: [1, 2, 100, 100]
470+
* - POINT (1, 100)
471+
* - POINT (2, NaN)
472+
*
473+
* Row group 2: [3, 3, 300, 300]
474+
* - POINT (3, 300)
475+
* - POINT (NaN, NaN)
476+
*
477+
* Row group 3: no valid bbox
478+
* - POINT (5, NaN)
479+
* - POINT (6, NaN)
480+
*
481+
* Row group 4: [7, 8, 700, 800]
482+
* - POINT (7, 700)
483+
* - POINT (8, 800)
484+
*
485+
* Row group 5: no valid bbox
486+
* - POINT (NaN, NaN)
487+
* - POINT (NaN, NaN)
488+
*
489+
* The test verifies that:
490+
* 1. Individual row group bounding boxes correctly handle NaN coordinates
491+
* 2. The merge operation correctly combines valid bounding boxes and ignores invalid ones
492+
* 3. The resulting file-level bounding box correctly represents the overall spatial extent [1, 8, 100, 800]
493+
* 4. The merge operation is commutative - the order of merging does not affect the result
494+
*/
495+
@Test
496+
public void testMergingRowGroupBoundingBoxes() {
497+
GeometryFactory gf = new GeometryFactory();
469498

470-
Assert.assertEquals(Double.NaN, box2.getXMin(), 0.0);
471-
Assert.assertEquals(Double.NaN, box2.getXMax(), 0.0);
472-
Assert.assertEquals(5.0, box2.getYMin(), 0.0);
473-
Assert.assertEquals(8.0, box2.getYMax(), 0.0);
499+
// File-level bounding box (to be computed by merging row group boxes)
500+
BoundingBox fileBBox = new BoundingBox();
501+
502+
// Row Group 1: [1, 2, 100, 100]
503+
BoundingBox rowGroup1 = new BoundingBox();
504+
rowGroup1.update(gf.createPoint(new Coordinate(1, 100)));
505+
// Point with NaN Y-coordinate
506+
rowGroup1.update(gf.createPoint(new Coordinate(2, Double.NaN)));
507+
508+
// Verify Row Group 1
509+
Assert.assertEquals(1.0, rowGroup1.getXMin(), 0.0);
510+
Assert.assertEquals(2.0, rowGroup1.getXMax(), 0.0);
511+
Assert.assertEquals(100.0, rowGroup1.getYMin(), 0.0);
512+
Assert.assertEquals(100.0, rowGroup1.getYMax(), 0.0);
513+
Assert.assertTrue(rowGroup1.isValid());
514+
515+
// Row Group 2: [3, 3, 300, 300]
516+
BoundingBox rowGroup2 = new BoundingBox();
517+
rowGroup2.update(gf.createPoint(new Coordinate(3, 300)));
518+
// Point with all NaN coordinates
519+
Coordinate nanCoord = new Coordinate(Double.NaN, Double.NaN);
520+
rowGroup2.update(gf.createPoint(nanCoord));
521+
522+
// Verify Row Group 2
523+
Assert.assertEquals(3.0, rowGroup2.getXMin(), 0.0);
524+
Assert.assertEquals(3.0, rowGroup2.getXMax(), 0.0);
525+
Assert.assertEquals(300.0, rowGroup2.getYMin(), 0.0);
526+
Assert.assertEquals(300.0, rowGroup2.getYMax(), 0.0);
527+
Assert.assertTrue(rowGroup2.isValid());
528+
529+
// Row Group 3: No defined bbox due to NaN Y-coordinates
530+
BoundingBox rowGroup3 = new BoundingBox();
531+
rowGroup3.update(gf.createPoint(new Coordinate(5, Double.NaN)));
532+
rowGroup3.update(gf.createPoint(new Coordinate(6, Double.NaN)));
533+
534+
// Verify Row Group 3
535+
Assert.assertTrue(rowGroup3.isEmpty());
536+
537+
// Row Group 4: [7, 8, 700, 800]
538+
BoundingBox rowGroup4 = new BoundingBox();
539+
rowGroup4.update(gf.createPoint(new Coordinate(7, 700)));
540+
rowGroup4.update(gf.createPoint(new Coordinate(8, 800)));
541+
542+
// Verify Row Group 4
543+
Assert.assertEquals(7.0, rowGroup4.getXMin(), 0.0);
544+
Assert.assertEquals(8.0, rowGroup4.getXMax(), 0.0);
545+
Assert.assertEquals(700.0, rowGroup4.getYMin(), 0.0);
546+
Assert.assertEquals(800.0, rowGroup4.getYMax(), 0.0);
547+
Assert.assertTrue(rowGroup4.isValid());
548+
549+
// Row Group 5: No defined bbox due to all NaN coordinates
550+
BoundingBox rowGroup5 = new BoundingBox();
551+
rowGroup5.update(gf.createPoint(nanCoord));
552+
rowGroup5.update(gf.createPoint(nanCoord));
553+
554+
// Verify Row Group 5
555+
Assert.assertTrue(rowGroup5.isEmpty());
556+
557+
// Merge row group boxes into file-level box
558+
fileBBox.merge(rowGroup1);
559+
fileBBox.merge(rowGroup2);
560+
fileBBox.merge(rowGroup3);
561+
fileBBox.merge(rowGroup4);
562+
fileBBox.merge(rowGroup5);
563+
564+
// Verify file-level bounding box
565+
// Expected: [1, 8, 100, 800] from valid coordinates
566+
Assert.assertEquals(1.0, fileBBox.getXMin(), 0.0);
567+
Assert.assertEquals(8.0, fileBBox.getXMax(), 0.0);
568+
Assert.assertEquals(100.0, fileBBox.getYMin(), 0.0);
569+
Assert.assertEquals(800.0, fileBBox.getYMax(), 0.0);
570+
Assert.assertTrue(fileBBox.isValid());
571+
572+
// Test merging in reverse order to ensure commutativity
573+
BoundingBox reverseMergeBox = new BoundingBox();
574+
reverseMergeBox.merge(rowGroup5);
575+
reverseMergeBox.merge(rowGroup4);
576+
reverseMergeBox.merge(rowGroup3);
577+
reverseMergeBox.merge(rowGroup2);
578+
reverseMergeBox.merge(rowGroup1);
579+
580+
Assert.assertEquals(1.0, reverseMergeBox.getXMin(), 0.0);
581+
Assert.assertEquals(8.0, reverseMergeBox.getXMax(), 0.0);
582+
Assert.assertEquals(100.0, reverseMergeBox.getYMin(), 0.0);
583+
Assert.assertEquals(800.0, reverseMergeBox.getYMax(), 0.0);
584+
Assert.assertTrue(reverseMergeBox.isValid());
474585
}
475586
}

0 commit comments

Comments
 (0)