33from copy import deepcopy
44from io import BytesIO
55from pathlib import Path
6- from typing import Any , Callable , Final , Optional , Union
6+ from typing import Any , Callable , Final , Union
77from urllib .parse import urlparse
88
99from docling_core .types .doc import (
@@ -82,13 +82,13 @@ def __init__(
8282 self .valid : bool = False
8383 # Initialise the parents for the hierarchy
8484 self .max_levels : int = 10
85- self .level_at_new_list : Optional [ int ] = None
86- self .parents : dict [int , Optional [ NodeItem ] ] = {}
85+ self .level_at_new_list : int | None = None
86+ self .parents : dict [int , NodeItem | None ] = {}
8787 self .numbered_headers : dict [int , int ] = {}
8888 self .equation_bookends : str = "<eq>{EQ}</eq>"
8989 # Track processed textbox elements to avoid duplication
9090 self .processed_textbox_elements : list [int ] = []
91- self .docx_to_pdf_converter : Optional [ Callable ] = None
91+ self .docx_to_pdf_converter : Callable | None = None
9292 self .docx_to_pdf_converter_init = False
9393 self .display_drawingml_warning = True
9494
@@ -195,26 +195,26 @@ def load_msword_file(
195195 def _update_history (
196196 self ,
197197 name : str ,
198- level : Optional [ int ] ,
199- numid : Optional [ int ] ,
200- ilevel : Optional [ int ] ,
198+ level : int | None ,
199+ numid : int | None ,
200+ ilevel : int | None ,
201201 ):
202202 self .history ["names" ].append (name )
203203 self .history ["levels" ].append (level )
204204
205205 self .history ["numids" ].append (numid )
206206 self .history ["indents" ].append (ilevel )
207207
208- def _prev_name (self ) -> Optional [ str ] :
208+ def _prev_name (self ) -> str | None :
209209 return self .history ["names" ][- 1 ]
210210
211- def _prev_level (self ) -> Optional [ int ] :
211+ def _prev_level (self ) -> int | None :
212212 return self .history ["levels" ][- 1 ]
213213
214- def _prev_numid (self ) -> Optional [ int ] :
214+ def _prev_numid (self ) -> int | None :
215215 return self .history ["numids" ][- 1 ]
216216
217- def _prev_indent (self ) -> Optional [ int ] :
217+ def _prev_indent (self ) -> int | None :
218218 return self .history ["indents" ][- 1 ]
219219
220220 def _get_level (self ) -> int :
@@ -366,9 +366,7 @@ def _walk_linear(
366366
367367 return doc , added_elements
368368
369- def _str_to_int (
370- self , s : Optional [str ], default : Optional [int ] = 0
371- ) -> Optional [int ]:
369+ def _str_to_int (self , s : str | None , default : int | None = 0 ) -> int | None :
372370 if s is None :
373371 return None
374372 try :
@@ -386,7 +384,7 @@ def _split_text_and_number(self, input_string: str) -> list[str]:
386384
387385 def _get_numId_and_ilvl (
388386 self , paragraph : Paragraph
389- ) -> tuple [Optional [ int ], Optional [ int ] ]:
387+ ) -> tuple [int | None , int | None ]:
390388 # Access the XML element of the paragraph
391389 numPr = paragraph ._element .find (
392390 ".//w:numPr" , namespaces = paragraph ._element .nsmap
@@ -559,7 +557,7 @@ def _build_multi_level_marker(
559557
560558 return marker
561559
562- def _get_outline_level_from_style (self , paragraph : Paragraph ) -> Optional [ int ] :
560+ def _get_outline_level_from_style (self , paragraph : Paragraph ) -> int | None :
563561 """Extract outlineLvl from paragraph's style definition.
564562
565563 In OOXML, outlineLvl is 0-indexed (0-8 for heading levels 1-9).
@@ -584,13 +582,13 @@ def _get_outline_level_from_style(self, paragraph: Paragraph) -> Optional[int]:
584582 pass
585583 return None
586584
587- def _get_heading_and_level (self , style_label : str ) -> tuple [str , Optional [ int ] ]:
585+ def _get_heading_and_level (self , style_label : str ) -> tuple [str , int | None ]:
588586 parts = self ._split_text_and_number (style_label )
589587
590588 if len (parts ) == 2 :
591589 parts .sort ()
592590 label_str : str = ""
593- label_level : Optional [ int ] = 0
591+ label_level : int | None = 0
594592 if parts [0 ].strip ().lower () == "heading" :
595593 label_str = "Heading"
596594 label_level = self ._str_to_int (parts [1 ], None )
@@ -604,14 +602,14 @@ def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
604602
605603 return style_label , None
606604
607- def _get_label_and_level (self , paragraph : Paragraph ) -> tuple [str , Optional [ int ] ]:
605+ def _get_label_and_level (self , paragraph : Paragraph ) -> tuple [str , int | None ]:
608606 if paragraph .style is None :
609607 return "Normal" , None
610608
611609 label : str = paragraph .style .style_id
612610 name : str = paragraph .style .name or ""
613- base_style_label : Optional [ str ] = None
614- base_style_name : Optional [ str ] = None
611+ base_style_label : str | None = None
612+ base_style_name : str | None = None
615613 if isinstance (
616614 base_style := getattr (paragraph .style , "base_style" , None ), ParagraphStyle
617615 ):
@@ -653,7 +651,7 @@ def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]
653651 return label , None
654652
655653 @classmethod
656- def _get_format_from_run (cls , run : Run ) -> Optional [ Formatting ] :
654+ def _get_format_from_run (cls , run : Run ) -> Formatting | None :
657655 # The .bold and .italic properties are booleans, but .underline can be an enum
658656 # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
659657 is_bold = run .bold or False
@@ -683,7 +681,7 @@ def _get_paragraph_elements(self, paragraph: Paragraph):
683681 return [("" , None , None )]
684682
685683 paragraph_elements : list [
686- tuple [str , Optional [ Formatting ], Optional [ Union [AnyUrl , Path ]] ]
684+ tuple [str , Formatting | None , Union [AnyUrl , Path ] | None ]
687685 ] = []
688686 group_text = ""
689687 previous_format = None
@@ -990,9 +988,9 @@ def _create_or_reuse_parent(
990988 self ,
991989 * ,
992990 doc : DoclingDocument ,
993- prev_parent : Optional [ NodeItem ] ,
991+ prev_parent : NodeItem | None ,
994992 paragraph_elements : list ,
995- ) -> Optional [ NodeItem ] :
993+ ) -> NodeItem | None :
996994 return (
997995 doc .add_inline_group (parent = prev_parent , content_layer = self .content_layer )
998996 if len (paragraph_elements ) > 1
@@ -1200,7 +1198,7 @@ def _handle_text_elements(
12001198 def _add_heading (
12011199 self ,
12021200 doc : DoclingDocument ,
1203- curr_level : Optional [ int ] ,
1201+ curr_level : int | None ,
12041202 text : str ,
12051203 is_numbered_style : bool = False ,
12061204 ) -> list [RefItem ]:
@@ -1499,7 +1497,7 @@ def _handle_tables(
14991497 cell_set .add (cell ._tc )
15001498
15011499 spanned_idx = row_idx
1502- spanned_tc : Optional [ CT_Tc ] = cell ._tc
1500+ spanned_tc : CT_Tc | None = cell ._tc
15031501 while spanned_tc == cell ._tc :
15041502 spanned_idx += 1
15051503 spanned_tc = (
@@ -1640,8 +1638,8 @@ def _is_rich_table_cell(self, cell: _Cell) -> bool:
16401638 def _handle_pictures (
16411639 self , drawing_blip : Any , doc : DoclingDocument
16421640 ) -> list [RefItem ]:
1643- def get_docx_image (image : Any ) -> Optional [ bytes ] :
1644- image_data : Optional [ bytes ] = None
1641+ def get_docx_image (image : Any ) -> bytes | None :
1642+ image_data : bytes | None = None
16451643 rId = image .get (
16461644 "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
16471645 )
@@ -1655,7 +1653,7 @@ def get_docx_image(image: Any) -> Optional[bytes]:
16551653 if drawing_blip :
16561654 level = self ._get_level ()
16571655 # Open the BytesIO object with PIL to create an Image
1658- parent : Optional [ NodeItem ] = (
1656+ parent : NodeItem | None = (
16591657 self .parents [level - 1 ]
16601658 if len (drawing_blip ) == 1
16611659 else doc .add_group (
@@ -1665,7 +1663,7 @@ def get_docx_image(image: Any) -> Optional[bytes]:
16651663 )
16661664 )
16671665 for image in drawing_blip :
1668- image_data : Optional [ bytes ] = get_docx_image (image )
1666+ image_data : bytes | None = get_docx_image (image )
16691667 if image_data is None :
16701668 _log .warning ("Warning: image cannot be found" )
16711669 p1 = doc .add_picture (
0 commit comments