2323from ocrd_modelfactory import page_from_file
2424from ocrd_models .ocrd_page import (
2525 PageType ,
26+ BorderType ,
2627 TextRegionType ,
2728 to_xml
2829)
@@ -55,18 +56,18 @@ def __init__(self, *args, **kwargs):
5556
5657 def process (self ):
5758 """Perform generic post-processing of page segmentation with Shapely and OpenCV.
58-
59+
5960 Open and deserialize PAGE input files and their respective images,
6061 then validate syntax and semantics, checking for invalid or inconsistent
6162 segmentation. Fix invalidities by simplifying and/or re-ordering polygon paths.
6263 Fix inconsistencies by shrinking segment polygons to their parents. Log
6364 errors that cannot be repaired automatically.
64-
65+
6566 Next, if ``simplify`` is non-zero, then for each segment (top-level page or
6667 recursive region, line, word, glyph), simplify the polygon points up to that
6768 precision, while preserving its topology and parent-child consistency.
6869 (This will usually reduce the number of points.)
69-
70+
7071 \b
7172 Next, if ``plausibilize``, then for each segment (top-level page or recursive region)
7273 which contains any text regions, try to find all pairs of such regions in it that
@@ -88,14 +89,21 @@ def process(self):
8889 (a fraction of more than ``plausibilize_merge_min_overlap``),
8990 then the one line can be merged into the other.
9091 * If another overlap, and
91- - if either line's centroid is in the other,
92+ - if either line's centroid is in the other,
9293 then the smaller line can be merged into the larger,
9394 - otherwise the smaller line can be subtracted from the larger.
9495 Apply those repairs and update the reading order.
95-
96- Furthermore, if ``sanitize``, then for each text region, update
97- the coordinates to become the minimal convex hull of its constituent
98- text lines. (But consider running ocrd-segment-project instead.)
96+
97+ Next, if ``spread`` is non-zero, then enlarge each ``spread_level`` segment
98+ by this many pixels (without causing additional overlap between neighbours).
99+
100+ However, if ``sanitize``, then as a first step (prior to everything else
101+ including repairs), for each text region, update the coordinates to become
102+ the minimal convex hull of its binary foreground. (So in contrast to
103+ ocrd-segment-project, this ignores constituent lines. It uses the binarized
104+ image and generates a tight outline properly contained within the old
105+ region outline, as if extended by ``sanitize_padding``. If ``spread`` is
106+ non-zero and ``spread_level=region``, then this still applies to the result.)
99107
100108 Finally, produce new output files by serialising the resulting hierarchy.
101109 """
@@ -188,6 +196,8 @@ def process(self):
188196 # delete/merge/split redundant text regions (or its text lines)
189197 if self .parameter ['plausibilize' ]:
190198 self .plausibilize_page (page , page_id )
199+ if self .parameter ['spread' ]:
200+ self .spread_segments (page , page_id )
191201
192202 self .workspace .add_file (
193203 ID = file_id ,
@@ -216,7 +226,7 @@ def simplify_page(self, page, page_id):
216226 ensure_consistent (region , at_parent = True )
217227 if page .get_Border () is not None :
218228 ensure_consistent (page )
219-
229+
220230 def plausibilize_page (self , page , page_id ):
221231 ro = page .get_ReadingOrder ()
222232 if ro :
@@ -286,18 +296,50 @@ def plausibilize_page(self, page, page_id):
286296 marked_for_deletion ,
287297 marked_for_merging ,
288298 marked_for_splitting )
289-
299+
def spread_segments(self, page, page_id):
    """Enlarge all segments of the configured hierarchy level on this page.

    Read ``spread_level`` and ``spread`` from the processor parameters and
    hand each group of sibling segments at that level to the module-level
    ``spread_segments`` function:
    - ``page``: the Border, if there is one,
    - ``table``: the text regions (cells) of each table region returned
      by ``page.get_TableRegion()``,
    - ``region``: all regions returned by ``page.get_AllRegions(depth=1)``,
    - ``line``: the text lines of each text region among those regions,
    - ``word``: the words of each of those text lines,
    - any other value: the glyphs of each of those words.

    ``page_id`` is currently unused.
    """
    def spread(siblings):
        # Inside a method body this name resolves to the module-level
        # function, not to this method. The parameter is looked up per
        # call, exactly where a group of siblings is actually processed.
        spread_segments(siblings, self.parameter['spread'])

    granularity = self.parameter['spread_level']

    if granularity == 'page':
        frame = page.get_Border()
        if frame is not None:
            spread([frame])
        return

    if granularity == 'table':
        for table_region in page.get_TableRegion():
            spread(table_region.get_TextRegion())
        return

    top_regions = page.get_AllRegions(depth=1)
    if granularity == 'region':
        spread(top_regions)
        return

    # All remaining levels live below text regions only.
    for text_region in top_regions:
        if not isinstance(text_region, TextRegionType):
            continue
        text_lines = text_region.get_TextLine()
        if granularity == 'line':
            spread(text_lines)
        elif granularity == 'word':
            for text_line in text_lines:
                spread(text_line.get_Word())
        else:
            for text_line in text_lines:
                for word in text_line.get_Word():
                    spread(word.get_Glyph())
331+
290332def _compare_segments (seg1 , seg2 , poly1 , poly2 , marked_for_deletion , marked_for_merging , min_overlap , page_id ):
291333 """Determine redundancies in a pair of regions/lines
292-
334+
293335 \b
294336 For segments ``seg1`` (with coordinates ``poly1``) and ``seg2`` (with coordinates ``poly2``),
295337 - if their coordinates are nearly identical, then just mark ``seg2`` for deletion
296338 - if either properly contains the other, then mark the other for deletion
297339 - if they overlap, then mark the most overlapped side in favour of the other – unless
298340 - the union is larger than the sum (i.e. covers area outside of both) and
299341 - the intersection is smaller than ``min_overlap`` fraction of either side
300-
342+
301343 Return whether something else besides deletion must be done about the redundancy,
302344 i.e. true iff they overlap, but neither side could be marked for deletion.
303345 """
@@ -347,7 +389,7 @@ def _compare_segments(seg1, seg2, poly1, poly2, marked_for_deletion, marked_for_
347389
348390def _merge_segments (seg , superseg , poly , superpoly , segpolys , reading_order ):
349391 """Merge one segment into another and update reading order refs.
350-
392+
351393 \b
352394 Given a region/line ``seg`` that should be dissolved into a
353395 region/line ``superseg``, update the latter's
@@ -432,17 +474,17 @@ def _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order):
432474 LOG .warning ('Merging "{}" with TextEquiv {} into "{}" with {}' .format (
433475 seg .id , seg .get_TextEquiv (), # FIXME needs repr...
434476 superseg .id , superseg .get_TextEquiv ())) # ...to be informative
435-
477+
436478def _plausibilize_segments (segpolys , rogroup , marked_for_deletion , marked_for_merging , marked_for_splitting ):
437479 """Remove redundancy among a set of segments by applying deletion/merging/splitting
438-
480+
439481 \b
440482 Given the segment-polygon tuples ``segpolys`` and analysis of actions to be taken:
441483 - ``marked_for_deletion``: list of segment identifiers that can be removed,
442484 - ``marked_for_merging``: dict of segment identifiers that can be dissolved into some other,
443485 - ``marked_for_splitting``: dict of segment identifiers that can be shrunk in favour of some other,
444486 apply these one by one (possibly recursing from regions to lines).
445-
487+
446488 Finally, update the reading order ``rogroup`` accordingly.
447489 """
448490 LOG = getLogger ('processor.RepairSegmentation' )
@@ -498,7 +540,7 @@ def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_me
498540
499541def page_get_reading_order (ro , rogroup ):
500542 """Add all elements from the given reading order group to the given dictionary.
501-
543+
502544 Given a dict ``ro`` from layout element IDs to ReadingOrder element objects,
503545 and an object ``rogroup`` with additional ReadingOrder element objects,
504546 add all references to the dict, traversing the group recursively.
@@ -568,6 +610,17 @@ def shrink_regions(page_image, page_coords, page, page_id, padding=0):
568610 LOG .debug ('Using new coordinates for region "%s"' , region .id )
569611 region .get_Coords ().set_points (points_from_polygon (region_polygon .exterior .coords [:- 1 ]))
570612
def spread_segments(segments, distance=0):
    """Enlarge each segment's outline by ``distance``, sparing its siblings.

    For every segment in ``segments``, buffer its coordinate polygon by
    ``distance``, remove from that the union of all *original* polygons
    of the given segments, merge the remainder back with the segment's
    own original polygon, and write the exterior of the result back into
    the segment's ``Coords``. Afterwards ``ensure_consistent`` is called
    with ``at_parent=True`` on the segment.

    Args:
        segments: sibling segments offering ``get_Coords()`` (may be any
            iterable; it is materialised once because it is used twice).
        distance: number of pixels to grow by. Zero or negative values
            mean there is nothing to spread, so the segments are left
            untouched (same convention as ``simplify`` with its
            ``tolerance``).

    Returns:
        None. The segments are modified in place.
    """
    if distance <= 0:
        return  # nothing to do
    segments = list(segments)
    if not segments:
        return
    polygons = [Polygon(polygon_from_points(segment.get_Coords().points))
                for segment in segments]
    # Area already taken by any of the siblings (including each segment itself).
    all_poly = unary_union(polygons)
    for segment, polygon in zip(segments, polygons):
        # enlarge by spread, then remove any existing segments except for original outline
        # NOTE(review): only the original outlines are subtracted, so two
        # siblings closer than 2*distance can both grow into the gap between
        # them -- confirm whether that overlap is acceptable.
        grown = merge_poly(polygon, polygon.buffer(distance).difference(all_poly))
        # Drop the closing point, which repeats the first one.
        outline = grown.exterior.coords[:-1]
        segment.get_Coords().set_points(points_from_polygon(outline))
        ensure_consistent(segment, at_parent=True)
623+
571624def simplify (segment , tolerance = 0 ):
572625 if tolerance <= 0 :
573626 return # nothing to do
@@ -634,22 +687,22 @@ def page_poly(page):
634687# same as polygon_for_parent pattern in other processors
635688def ensure_consistent (child , at_parent = False ):
636689 """Make segment coordinates fit into parent coordinates.
637-
690+
638691 Ensure that the coordinate polygon of ``child`` is fully
639692 contained in the coordinate polygon of its parent.
640-
693+
641694 \b
642695 To achieve that when necessary, either
643696 - enlarge the parent to the union of both,
644697 if ``at_parent``
645698 - shrink the child to the intersection of both,
646699 otherwise.
647-
700+
648701 In any case, ensure the resulting polygon is valid.
649-
702+
650703 If the parent is at page level, and there is no Border,
651704 then use the page frame (and assume `at_parent=False`).
652-
705+
653706 If ``child`` is at page level, and there is a Border,
654707 then use the page frame as parent (and assume `at_parent=False`).
655708 """
@@ -660,6 +713,11 @@ def ensure_consistent(child, at_parent=False):
660713 parentp = page_poly (child )
661714 at_parent = False # clip to page frame
662715 parent = child
716+ elif isinstance (child , BorderType ):
717+ childp = Polygon (polygon_from_points (child .get_Coords ().points ))
718+ parentp = page_poly (child .parent_object_ )
719+ at_parent = False # clip to page frame
720+ parent = child .parent_object_
663721 else :
664722 points = child .get_Coords ().points
665723 polygon = polygon_from_points (points )
0 commit comments