Skip to content

Commit 5977265

Browse files
committed
Refactoring for StrikethroughProcessor and XYCutPlusPlusSorter
1 parent 8e3f74a commit 5977265

3 files changed

Lines changed: 32 additions & 59 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/StrikethroughProcessor.java

Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -94,9 +94,10 @@ public static List<IObject> processStrikethroughs(List<IObject> pageContents) {
9494

9595
if (!matchingChunks.isEmpty() && matchingChunks.size() <= MAX_TEXT_CHUNKS_PER_LINE) {
9696
for (TextChunk chunk : matchingChunks) {
97-
String value = chunk.getValue();
98-
if (!value.startsWith("~~")) {
97+
if (!chunk.getIsStrikethroughText()) {
98+
String value = chunk.getValue();
9999
chunk.setValue("~~" + value + "~~");
100+
chunk.setIsStrikethroughText();
100101
}
101102
}
102103
}
@@ -121,10 +122,7 @@ static boolean isTableBorderLine(LineChunk line) {
121122
* Determines whether a horizontal line is a strikethrough for the given text chunk.
122123
*/
123124
static boolean isStrikethroughLine(LineChunk line, TextChunk textChunk) {
124-
BoundingBox textBox = textChunk.getBoundingBox();
125-
double textBottomY = textBox.getBottomY();
126-
double textTopY = textBox.getTopY();
127-
double textHeight = textTopY - textBottomY;
125+
double textHeight = textChunk.getHeight();
128126

129127
if (textHeight <= 0) {
130128
return false;
@@ -137,19 +135,19 @@ static boolean isStrikethroughLine(LineChunk line, TextChunk textChunk) {
137135
}
138136

139137
// Check vertical position: the line's Y should be near the vertical center of the text
140-
double textCenterY = (textBottomY + textTopY) / 2.0;
141-
double lineY = line.getStartY();
138+
double textCenterY = textChunk.getCenterY();
139+
double lineY = line.getCenterY();
142140
double tolerance = textHeight * VERTICAL_CENTER_TOLERANCE;
143141

144142
if (Math.abs(lineY - textCenterY) > tolerance) {
145143
return false;
146144
}
147145

148146
// Check horizontal overlap
149-
double textLeftX = textBox.getLeftX();
150-
double textRightX = textBox.getRightX();
151-
double lineLeftX = Math.min(line.getStartX(), line.getEndX());
152-
double lineRightX = Math.max(line.getStartX(), line.getEndX());
147+
double textLeftX = textChunk.getLeftX();
148+
double textRightX = textChunk.getRightX();
149+
double lineLeftX = line.getLeftX();
150+
double lineRightX = line.getRightX();
153151

154152
double overlapLeft = Math.max(textLeftX, lineLeftX);
155153
double overlapRight = Math.min(textRightX, lineRightX);
@@ -159,13 +157,13 @@ static boolean isStrikethroughLine(LineChunk line, TextChunk textChunk) {
159157
return false;
160158
}
161159

162-
double textWidth = textRightX - textLeftX;
160+
double textWidth = textChunk.getWidth();
163161
if (textWidth <= 0 || (overlapWidth / textWidth) < MIN_HORIZONTAL_OVERLAP_RATIO) {
164162
return false;
165163
}
166164

167165
// Reject lines that extend far beyond the text
168-
double lineWidth = lineRightX - lineLeftX;
166+
double lineWidth = line.getBoundingBox().getWidth();
169167
if (lineWidth / textWidth > MAX_LINE_TO_TEXT_WIDTH_RATIO) {
170168
return false;
171169
}

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/readingorder/XYCutPlusPlusSorter.java

Lines changed: 19 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,7 @@ static List<IObject> identifyCrossLayoutElements(List<IObject> objects, double b
156156
for (IObject obj : objects) {
157157
BoundingBox bbox = obj.getBoundingBox();
158158
if (bbox != null) {
159-
double width = bbox.getRightX() - bbox.getLeftX();
159+
double width = bbox.getWidth();
160160
maxWidth = Math.max(maxWidth, width);
161161
}
162162
}
@@ -171,7 +171,7 @@ static List<IObject> identifyCrossLayoutElements(List<IObject> objects, double b
171171
continue;
172172
}
173173

174-
double width = bbox.getRightX() - bbox.getLeftX();
174+
double width = bbox.getWidth();
175175

176176
// Criterion 1: Width exceeds threshold (close to max width)
177177
if (width >= threshold) {
@@ -239,8 +239,8 @@ static double calculateHorizontalOverlapRatio(BoundingBox box1, BoundingBox box2
239239
return 0;
240240
}
241241

242-
double width1 = box1.getRightX() - box1.getLeftX();
243-
double width2 = box2.getRightX() - box2.getLeftX();
242+
double width1 = box1.getWidth();
243+
double width2 = box2.getWidth();
244244
double smallerWidth = Math.min(width1, width2);
245245

246246
return smallerWidth > 0 ? overlapWidth / smallerWidth : 0;
@@ -267,9 +267,7 @@ static double computeDensityRatio(List<IObject> objects) {
267267
return 1.0;
268268
}
269269

270-
double regionWidth = regionBounds.getRightX() - regionBounds.getLeftX();
271-
double regionHeight = regionBounds.getTopY() - regionBounds.getBottomY();
272-
double regionArea = regionWidth * regionHeight;
270+
double regionArea = regionBounds.getArea();
273271

274272
if (regionArea <= 0) {
275273
return 1.0;
@@ -286,32 +284,14 @@ static double computeDensityRatio(List<IObject> objects) {
286284
* @return Bounding box encompassing all objects, or null if no valid objects
287285
*/
288286
static BoundingBox calculateBoundingRegion(List<IObject> objects) {
289-
double minX = Double.MAX_VALUE;
290-
double maxX = Double.MIN_VALUE;
291-
double minY = Double.MAX_VALUE;
292-
double maxY = Double.MIN_VALUE;
293-
int pageNumber = 0;
294-
boolean found = false;
287+
BoundingBox boundingBox = new BoundingBox();
295288

296289
for (IObject obj : objects) {
297290
BoundingBox bbox = obj.getBoundingBox();
298-
if (bbox == null) {
299-
continue;
300-
}
301-
302-
found = true;
303-
pageNumber = bbox.getPageNumber();
304-
minX = Math.min(minX, bbox.getLeftX());
305-
maxX = Math.max(maxX, bbox.getRightX());
306-
minY = Math.min(minY, bbox.getBottomY());
307-
maxY = Math.max(maxY, bbox.getTopY());
308-
}
309-
310-
if (!found) {
311-
return null;
291+
boundingBox.union(bbox);
312292
}
313293

314-
return new BoundingBox(pageNumber, minX, minY, maxX, maxY);
294+
return boundingBox.isEmpty() ? null : boundingBox;
315295
}
316296

317297
/**
@@ -325,9 +305,7 @@ static double calculateTotalArea(List<IObject> objects) {
325305
for (IObject obj : objects) {
326306
BoundingBox bbox = obj.getBoundingBox();
327307
if (bbox != null) {
328-
double width = bbox.getRightX() - bbox.getLeftX();
329-
double height = bbox.getTopY() - bbox.getBottomY();
330-
totalArea += width * height;
308+
totalArea += bbox.getArea();
331309
}
332310
}
333311
return totalArea;
@@ -443,12 +421,12 @@ private static CutInfo findBestVerticalCutWithProjection(List<IObject> objects)
443421
if (objects.size() >= 3) {
444422
BoundingBox region = calculateBoundingRegion(objects);
445423
if (region != null) {
446-
double regionWidth = region.getRightX() - region.getLeftX();
424+
double regionWidth = region.getWidth();
447425
double narrowThreshold = regionWidth * NARROW_ELEMENT_WIDTH_RATIO;
448426
List<IObject> filtered = new ArrayList<>();
449427
for (IObject obj : objects) {
450428
BoundingBox bbox = obj.getBoundingBox();
451-
double width = bbox.getRightX() - bbox.getLeftX();
429+
double width = bbox.getWidth();
452430
if (width >= narrowThreshold) {
453431
filtered.add(obj);
454432
}
@@ -479,9 +457,8 @@ private static CutInfo findVerticalCutByEdges(List<IObject> objects) {
479457
Double prevRight = null;
480458

481459
for (IObject obj : sorted) {
482-
BoundingBox bbox = obj.getBoundingBox();
483-
double left = bbox.getLeftX();
484-
double right = bbox.getRightX();
460+
double left = obj.getLeftX();
461+
double right = obj.getRightX();
485462

486463
if (prevRight != null && left > prevRight) {
487464
double gap = left - prevRight;
@@ -519,9 +496,8 @@ private static CutInfo findBestHorizontalCutWithProjection(List<IObject> objects
519496
Double prevBottom = null;
520497

521498
for (IObject obj : sorted) {
522-
BoundingBox bbox = obj.getBoundingBox();
523-
double top = bbox.getTopY();
524-
double bottom = bbox.getBottomY();
499+
double top = obj.getTopY();
500+
double bottom = obj.getBottomY();
525501

526502
if (prevBottom != null && prevBottom > top) {
527503
double gap = prevBottom - top;
@@ -550,9 +526,8 @@ static List<List<IObject>> splitByHorizontalCut(List<IObject> objects, double cu
550526
List<IObject> below = new ArrayList<>();
551527

552528
for (IObject obj : objects) {
553-
BoundingBox bbox = obj.getBoundingBox();
554529
// Use center Y to determine which group
555-
double centerY = (bbox.getTopY() + bbox.getBottomY()) / 2.0;
530+
double centerY = obj.getCenterY();
556531
if (centerY > cutY) {
557532
above.add(obj);
558533
} else {
@@ -583,9 +558,8 @@ static List<List<IObject>> splitByVerticalCut(List<IObject> objects, double cutX
583558
List<IObject> right = new ArrayList<>();
584559

585560
for (IObject obj : objects) {
586-
BoundingBox bbox = obj.getBoundingBox();
587561
// Use center X to determine which group
588-
double centerX = (bbox.getLeftX() + bbox.getRightX()) / 2.0;
562+
double centerX = obj.getCenterX();
589563
if (centerX < cutX) {
590564
left.add(obj);
591565
} else {
@@ -641,8 +615,8 @@ static List<IObject> mergeCrossLayoutElements(List<IObject> sortedMain, List<IOb
641615
IObject mainObj = sortedMain.get(mainIndex);
642616
IObject crossObj = sortedCrossLayout.get(crossIndex);
643617

644-
double mainTopY = mainObj.getBoundingBox().getTopY();
645-
double crossTopY = crossObj.getBoundingBox().getTopY();
618+
double mainTopY = mainObj.getTopY();
619+
double crossTopY = crossObj.getTopY();
646620

647621
if (crossTopY >= mainTopY) {
648622
// Cross-layout element is above or at same level, add it first

java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/StrikethroughProcessorTest.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,7 @@ public void testDoubleWrappingPrevented() {
152152

153153
TextChunk textChunk = new TextChunk(new BoundingBox(0, 10.0, 100.0, 60.0, 120.0),
154154
"~~already~~", 12, 100.0);
155+
textChunk.setIsStrikethroughText();
155156
contents.add(textChunk);
156157

157158
LineChunk line = LineChunk.createLineChunk(0, 10.0, 110.0, 60.0, 110.0, 1.0,

0 commit comments

Comments
 (0)