Skip to content

Commit 3612ac2

Browse files
rootBr1an67
authored and committed
fix(docx): preserve multi-level outline numbering from lvlText
Fixed a bug where multi-level outline numbering (e.g., "3.1", "3.2") was rendered as single-level only (e.g., "1", "2", "3"), losing the hierarchical structure.

Changes:
- Added _get_level_element() to extract level elements from numbering XML
- Refactored _is_numbered_list() to use the new helper function
- Added _get_level_text() to read the w:lvlText format string (e.g., "%1.%2")
- Added _build_multi_level_marker() to construct multi-level markers by substituting placeholders (%1, %2, etc.) with actual counter values
- Updated all marker generation in _add_list_item() to use the new function

The fix properly handles Word's outline numbering format strings, which define how multi-level numbers should be constructed (e.g., "%1.%2" produces "3.1").
1 parent 3d90778 commit 3612ac2

1 file changed

Lines changed: 75 additions & 17 deletions

File tree

docling/backend/msword_backend.py

Lines changed: 75 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -418,14 +418,14 @@ def _reset_list_counters_for_new_sequence(self, numid: int):
418418
for key in keys_to_reset:
419419
self.list_counters[key] = 0
420420

421-
def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
422-
"""Check if a list is numbered based on its numFmt value."""
421+
def _get_level_element(self, numId: int, ilvl: int):
422+
"""Get the level element from numbering XML for a given numId and ilvl."""
423423
try:
424424
# Access the numbering part of the document
425425
if not hasattr(self.docx_obj, "part") or not hasattr(
426426
self.docx_obj.part, "package"
427427
):
428-
return False
428+
return None
429429

430430
numbering_part = None
431431
# Find the numbering part
@@ -435,7 +435,7 @@ def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
435435
break
436436

437437
if numbering_part is None:
438-
return False
438+
return None
439439

440440
# Parse the numbering XML
441441
numbering_root = numbering_part.element
@@ -446,18 +446,18 @@ def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
446446
num_element = numbering_root.find(num_xpath, namespaces=namespaces)
447447

448448
if num_element is None:
449-
return False
449+
return None
450450

451451
# Get the abstractNumId from the num element
452452
abstract_num_id_elem = num_element.find(
453453
".//w:abstractNumId", namespaces=namespaces
454454
)
455455
if abstract_num_id_elem is None:
456-
return False
456+
return None
457457

458458
abstract_num_id = abstract_num_id_elem.get(f"{self._W_NS_CLARK}val")
459459
if abstract_num_id is None:
460-
return False
460+
return None
461461

462462
# Find the abstract numbering definition
463463
abstract_num_xpath = (
@@ -468,17 +468,29 @@ def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
468468
)
469469

470470
if abstract_num_element is None:
471-
return False
471+
return None
472472

473473
# Find the level definition for the given ilvl
474474
lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
475475
lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
476476

477+
return lvl_element
478+
479+
except Exception as e:
480+
_log.debug(f"Error getting level element: {e}")
481+
return None
482+
483+
def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
484+
"""Check if a list is numbered based on its numFmt value."""
485+
try:
486+
lvl_element = self._get_level_element(numId, ilvl)
477487
if lvl_element is None:
478488
return False
479489

480490
# Get the numFmt element
481-
num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
491+
num_fmt_element = lvl_element.find(
492+
".//w:numFmt", namespaces={"w": self._W_NS}
493+
)
482494
if num_fmt_element is None:
483495
return False
484496

@@ -501,6 +513,52 @@ def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
501513
_log.debug(f"Error determining if list is numbered: {e}")
502514
return False
503515

516+
def _get_level_text(self, numId: int, ilvl: int) -> str | None:
517+
"""Get the level text format (e.g., '%1.%2') for multi-level numbering."""
518+
try:
519+
lvl_element = self._get_level_element(numId, ilvl)
520+
if lvl_element is None:
521+
return None
522+
523+
# Get the lvlText element
524+
lvl_text_element = lvl_element.find(
525+
".//w:lvlText", namespaces={"w": self._W_NS}
526+
)
527+
if lvl_text_element is None:
528+
return None
529+
530+
lvl_text = lvl_text_element.get(f"{self._W_NS_CLARK}val")
531+
return lvl_text
532+
533+
except Exception as e:
534+
_log.debug(f"Error getting level text: {e}")
535+
return None
536+
537+
def _build_multi_level_marker(
538+
self, numid: int, ilevel: int, lvl_text: str | None
539+
) -> str:
540+
"""Build a multi-level marker from lvlText format like '%1.%2'."""
541+
if lvl_text is None:
542+
# Fallback to simple counter if lvlText not found
543+
counter = self._get_list_counter(numid, ilevel)
544+
return str(counter) + "."
545+
546+
# Replace placeholders like %1, %2, %3 with actual counter values
547+
# %1 = level 0 counter, %2 = level 1 counter, etc.
548+
marker = lvl_text
549+
for level in range(ilevel + 1):
550+
placeholder = f"%{level + 1}"
551+
if placeholder in marker:
552+
counter = self.list_counters.get((numid, level), 0)
553+
marker = marker.replace(placeholder, str(counter))
554+
555+
# Add a trailing period if the lvlText doesn't already end with one
556+
# and the marker doesn't already have punctuation
557+
if marker and not marker.endswith((".", ")", ":", "-")):
558+
marker += "."
559+
560+
return marker
561+
504562
def _get_outline_level_from_style(self, paragraph: Paragraph) -> Optional[int]:
505563
"""Extract outlineLvl from paragraph's style definition.
506564
@@ -1296,8 +1354,8 @@ def _add_list_item(
12961354

12971355
# Set marker and enumerated arguments if this is an enumeration element.
12981356
if is_numbered:
1299-
counter = self._get_list_counter(numid, ilevel)
1300-
enum_marker = str(counter) + "."
1357+
lvl_text = self._get_level_text(numid, ilevel)
1358+
enum_marker = self._build_multi_level_marker(numid, ilevel, lvl_text)
13011359
else:
13021360
enum_marker = ""
13031361
self._add_formatted_list_item(
@@ -1323,8 +1381,8 @@ def _add_list_item(
13231381

13241382
# TODO: Set marker and enumerated arguments if this is an enumeration element.
13251383
if is_numbered:
1326-
counter = self._get_list_counter(numid, ilevel)
1327-
enum_marker = str(counter) + "."
1384+
lvl_text = self._get_level_text(numid, ilevel)
1385+
enum_marker = self._build_multi_level_marker(numid, ilevel, lvl_text)
13281386
else:
13291387
enum_marker = ""
13301388
self._add_formatted_list_item(
@@ -1346,8 +1404,8 @@ def _add_list_item(
13461404

13471405
# TODO: Set marker and enumerated arguments if this is an enumeration element.
13481406
if is_numbered:
1349-
counter = self._get_list_counter(numid, ilevel)
1350-
enum_marker = str(counter) + "."
1407+
lvl_text = self._get_level_text(numid, ilevel)
1408+
enum_marker = self._build_multi_level_marker(numid, ilevel, lvl_text)
13511409
else:
13521410
enum_marker = ""
13531411
self._add_formatted_list_item(
@@ -1361,8 +1419,8 @@ def _add_list_item(
13611419
elif self._prev_numid() == numid or prev_indent == ilevel:
13621420
# Set marker and enumerated arguments if this is an enumeration element.
13631421
if is_numbered:
1364-
counter = self._get_list_counter(numid, ilevel)
1365-
enum_marker = str(counter) + "."
1422+
lvl_text = self._get_level_text(numid, ilevel)
1423+
enum_marker = self._build_multi_level_marker(numid, ilevel, lvl_text)
13661424
else:
13671425
enum_marker = ""
13681426
self._add_formatted_list_item(

0 commit comments

Comments
 (0)