Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 40 additions & 17 deletions docling/backend/latex/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import threading
from io import BytesIO
from pathlib import Path
from typing import Union
from typing import Any, Union

from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem
from docling_core.types.doc.document import Formatting
Expand All @@ -13,7 +13,9 @@
LatexMacroNode,
LatexMathNode,
LatexWalker,
get_default_latex_context_db,
)
from pylatexenc.macrospec import LatexContextDb, MacroSpec

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.latex.handlers.environments import EnvironmentHandlerMixin
Expand Down Expand Up @@ -45,8 +47,7 @@ def __init__(
):
super().__init__(in_doc, path_or_stream, options)
self.labels: dict[str, bool] = {}
self._custom_macros: dict[str, str] = {}
self._custom_macro_num_args: dict[str, int] = {}
self._custom_macros: dict[str, Any] = {}
self._input_stack: set[str] = set()
self.latex_text = decode_latex_content(self.path_or_stream)

Expand Down Expand Up @@ -78,14 +79,27 @@ def _do_parse_and_process(self, doc: DoclingDocument) -> DoclingDocument:

try:
self._extract_custom_macros(nodes)
self._extract_preamble_metadata(nodes, doc)
self.context_db = get_default_latex_context_db()
new_specs = [
MacroSpec(m_name, "{" * item["num_args"])
for m_name, item in self._custom_macros.items()
]
self.context_db.add_context_category("custom_user_macros", macros=new_specs)

new_walker = LatexWalker(
preprocessed_text, latex_context=self.context_db, tolerant_parsing=True
)
new_nodes, _, _ = new_walker.get_latex_nodes()

self._extract_preamble_metadata(new_nodes, doc)

doc_node = self._find_document_env(nodes)
self.in_macro = False
doc_node = self._find_document_env(new_nodes)

if doc_node:
self._process_nodes(doc_node.nodelist, doc)
else:
self._process_nodes(nodes, doc)
self._process_nodes(new_nodes, doc)

except Exception as e:
_log.error(f"Error processing LaTeX nodes: {e}")
Expand Down Expand Up @@ -139,11 +153,13 @@ def _process_nodes(
parent: NodeItem | None = None,
formatting: Formatting | None = None,
text_label: DocItemLabel | None = None,
text_buffer: list[str] | None = None,
):
if nodes is None:
return

text_buffer: list[str] = []
if text_buffer is None:
text_buffer = []

def flush_text_buffer():
if text_buffer:
Expand All @@ -157,10 +173,7 @@ def flush_text_buffer():
)
text_buffer.clear()

idx = 0
while idx < len(nodes):
node = nodes[idx]
consumed_following = 0
for node in nodes:
try:
if isinstance(node, LatexCharsNode):
self._process_chars_node(
Expand All @@ -174,24 +187,33 @@ def flush_text_buffer():
)

elif isinstance(node, LatexMacroNode):
consumed_following = self._process_macro_node_inline(
self._process_macro_node_inline(
node,
doc,
parent,
formatting,
text_label,
text_buffer,
flush_text_buffer,
nodes[idx + 1 :],
)

elif isinstance(node, LatexEnvironmentNode):
flush_text_buffer()
self._process_environment(node, doc, parent, formatting, text_label)
self._process_environment(
node,
doc,
parent,
formatting,
text_label,
)

elif isinstance(node, LatexMathNode):
self._process_math_node(
node, doc, parent, text_buffer, flush_text_buffer
node,
doc,
parent,
text_buffer,
flush_text_buffer,
)

elif isinstance(node, LatexGroupNode):
Expand All @@ -207,6 +229,7 @@ def flush_text_buffer():

except Exception as e:
_log.warning(f"Failed to process node {type(node).__name__}: {e}")
idx += 1 + consumed_following
continue

flush_text_buffer()
if not self.in_macro:
flush_text_buffer()
16 changes: 14 additions & 2 deletions docling/backend/latex/handlers/environments.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
NodeItem,
)
from pylatexenc.latexwalker import LatexEnvironmentNode, LatexMacroNode
from pylatexenc.macrospec import LatexContextDb

from docling.backend.latex.constants import ENV_LIST, ENV_MATH, ENV_QUOTE, ENV_THEOREM

Expand All @@ -25,7 +26,17 @@ def _process_nodes(
parent: "Any" = ...,
formatting: "Any" = ...,
text_label: "Any" = ...,
text_buffer: "Any" = ...,
) -> None: ...
def _process_math_node(
self,
node: "Any",
doc: "Any",
parent: "Any",
text_buffer: "Any",
flush_fn: "Any",
) -> None: ...
def _expand_custom_macros(self, node: Any, depth: int = 0) -> str: ...
def _clean_math(self, latex_str: str, env_name: str) -> str: ...
def _parse_table(self, node: "Any") -> "Any": ...
def _extract_verbatim_content(self, latex_str: str, env_name: str) -> str: ...
Expand Down Expand Up @@ -67,11 +78,12 @@ def _process_environment(
self._process_nodes(node.nodelist, doc, parent, formatting, text_label)

elif node.envname.replace("*", "") in ENV_MATH:
math_text = self._clean_math(node.latex_verbatim(), node.envname)
math_text = self._expand_custom_macros(node)
math_text = self._clean_math(math_text, node.envname)
doc.add_text(parent=parent, label=DocItemLabel.FORMULA, text=math_text)

elif node.envname == "math":
math_text = self._clean_math(node.latex_verbatim(), node.envname)
math_text = self._expand_custom_macros(node)
doc.add_text(parent=parent, label=DocItemLabel.FORMULA, text=math_text)

elif node.envname == "subequations":
Expand Down
Loading
Loading