diff --git a/doc-maker/config.json b/doc-maker/config.json index 68d73a2b..22ec9fb8 100644 --- a/doc-maker/config.json +++ b/doc-maker/config.json @@ -1,6 +1,6 @@ { "name": "Doc Maker", - "version": "2.0.0", + "version": "3.0.0", "description": "Word document automation integration using python-docx with markdown-first content creation. AI agents should use markdown syntax for most content creation - headings (#), paragraphs, lists (- or 1.), formatting (**bold**, *italic*), tables, blockquotes (>), and code blocks. Only use non-markdown actions for images, page breaks, or when you need direct table creation with structured data arrays.", "entry_point": "doc_maker.py", "actions": { diff --git a/doc-maker/doc_maker.py b/doc-maker/doc_maker.py index f9fbe4ab..8b7c71a4 100644 --- a/doc-maker/doc_maker.py +++ b/doc-maker/doc_maker.py @@ -10,6 +10,8 @@ from docx.shared import Inches from docx.enum.text import WD_BREAK from docx.document import Document as _Document +from docx.oxml import OxmlElement +from docx.oxml.ns import qn from docx.oxml.text.paragraph import CT_P from docx.oxml.table import CT_Tbl from docx.table import _Cell, Table @@ -35,9 +37,13 @@ def process_files(files: List[Dict[str, Any]]) -> Dict[str, BytesIO]: for file_item in files: content_as_string = file_item["content"] - padded_content_string = content_as_string + "=" * (-len(content_as_string) % 4) + padded_content_string = content_as_string + "=" * ( + -len(content_as_string) % 4 + ) - file_binary_data = base64.urlsafe_b64decode(padded_content_string.encode("ascii")) + file_binary_data = base64.urlsafe_b64decode( + padded_content_string.encode("ascii") + ) file_stream = BytesIO(file_binary_data) processed_files[file_item["name"]] = file_stream @@ -66,7 +72,9 @@ def load_document_from_files(document_id: str, files: List[Dict[str, Any]]) -> N "Files may be corrupted or not Word format." ) elif document_id not in documents: - raise ValueError(f"Document {document_id} not found and no files provided for loading") + raise ValueError( + f"Document {document_id} not found and no files provided for loading" + ) def _save_document_to_dict(document_id: str, file_path: str) -> Dict[str, Any]: @@ -269,7 +277,9 @@ def detect_placeholder_patterns(text: str) -> tuple[bool, str]: "cost", "description", ] - if len(original_text) < 20 and any(word in text_lower for word in business_keywords): + if len(original_text) < 20 and any( + word in text_lower for word in business_keywords + ): return True, "short_business" # Numbers and currency that look like placeholders @@ -302,7 +312,9 @@ def parse_and_apply_markdown_formatting(target, text: str): paragraph.clear() elif hasattr(target, "paragraphs"): # This is a table cell target.text = "" # Clear cell - paragraph = target.paragraphs[0] if target.paragraphs else target.add_paragraph() + paragraph = ( + target.paragraphs[0] if target.paragraphs else target.add_paragraph() + ) else: raise ValueError("Target must be a paragraph or table cell") @@ -348,10 +360,14 @@ def parse_and_apply_markdown_formatting(target, text: str): if earliest_match: # Add text before the match as normal text if earliest_pos > 0: - processed_parts.append({"text": remaining_text[:earliest_pos], "formatting": {}}) + processed_parts.append( + {"text": remaining_text[:earliest_pos], "formatting": {}} + ) # Add the formatted text - processed_parts.append({"text": earliest_match.group(1), "formatting": earliest_pattern}) + processed_parts.append( + {"text": earliest_match.group(1), "formatting": earliest_pattern} + ) # Continue with text after the match remaining_text = remaining_text[earliest_match.end() :] @@ -402,7 +418,9 @@ def is_likely_placeholder_context(text: str, find_word: str) -> bool: # Surrounded by placeholder indicators placeholder_indicators = ["{", "}", "[", "]", "_", "-", ".", "(", ")"] text_around = text.replace(find_word, "").strip() - if len(text_around) < 10 and any(indicator in text_around for indicator in placeholder_indicators): + if len(text_around) < 10 and any( + indicator in text_around for indicator in placeholder_indicators + ): return True # In obvious placeholder phrases @@ -440,7 +458,9 @@ def analyze_replacement_safety(find_text: str, matches_found: list) -> dict: alternatives = [] if len(safe_matches) > 0 and len(unsafe_matches) > 0: - guidance.append(f"Found {len(safe_matches)} safe placeholders and {len(unsafe_matches)} content text matches") + guidance.append( + f"Found {len(safe_matches)} safe placeholders and {len(unsafe_matches)} content text matches" + ) # Suggest safer alternatives based on actual safe matches safe_contexts = [] @@ -453,16 +473,24 @@ def analyze_replacement_safety(find_text: str, matches_found: list) -> dict: safe_contexts.append(safer_phrase.strip()) if safe_contexts: - alternatives.extend([f"Use '{ctx}' to target form fields" for ctx in safe_contexts[:2]]) + alternatives.extend( + [f"Use '{ctx}' to target form fields" for ctx in safe_contexts[:2]] + ) elif len(unsafe_matches) > 0: - guidance.append(f"All {len(unsafe_matches)} matches appear to be in content text - very risky") + guidance.append( + f"All {len(unsafe_matches)} matches appear to be in content text - very risky" + ) alternatives.append("Use position updates instead of text replacement") elif len(safe_matches) > 0: - guidance.append(f"All {len(safe_matches)} matches appear to be placeholders - relatively safe") + guidance.append( + f"All {len(safe_matches)} matches appear to be placeholders - relatively safe" + ) if len(safe_matches) > 1: - alternatives.append(f"Add replace_all=true to confirm you want all {len(safe_matches)} instances replaced") + alternatives.append( + f"Add replace_all=true to confirm you want all {len(safe_matches)} instances replaced" + ) return { "safety_level": "high_risk" @@ -479,7 +507,9 @@ def analyze_replacement_safety(find_text: str, matches_found: list) -> dict: "location": f"P{match['index']}" if match["type"] == "paragraph" else f"T{match['table_index']}R{match['row']}C{match['col']}", - "context": match["content"][:50] + "..." if len(match["content"]) > 50 else match["content"], + "context": match["content"][:50] + "..." + if len(match["content"]) > 50 + else match["content"], "safety": "SAFE" if match in safe_matches else "RISKY", } for match in matches_found[:5] # Show first 5 matches @@ -539,9 +569,13 @@ def analyze_document_structure(doc: Document) -> dict: element_index += 1 # Summary statistics - fillable_paragraphs = len([e for e in elements if e["type"] == "paragraph" and e["is_fillable"]]) + fillable_paragraphs = len( + [e for e in elements if e["type"] == "paragraph" and e["is_fillable"]] + ) fillable_cells = sum( - len([c for c in e.get("cells", []) if c["is_fillable"]]) for e in elements if e["type"] == "table" + len([c for c in e.get("cells", []) if c["is_fillable"]]) + for e in elements + if e["type"] == "table" ) return { @@ -554,12 +588,480 @@ def analyze_document_structure(doc: Document) -> dict: } +_PAREN_ITEM_RE = re.compile( + r"\((" + r"\d+" # (1), (2), … + r"|[a-z]" # (a), (b), … + r"|(?:i{1,3}|iv|vi{0,3}|ix|x{1,3}|xi{1,2})" # (i), (ii), … + r")\)\s+", + re.IGNORECASE, +) + +_ROMAN_VALS = { + "i": 1, + "ii": 2, + "iii": 3, + "iv": 4, + "v": 5, + "vi": 6, + "vii": 7, + "viii": 8, + "ix": 9, + "x": 10, + "xi": 11, + "xii": 12, +} + + +def _detect_paren_type(marker: str) -> tuple[str, int]: + """Given the text between parens (e.g. 'a', 'ii', '3'), return (ol_type, start_val).""" + low = marker.lower() + if low in _ROMAN_VALS: + return "i", _ROMAN_VALS[low] + if low.isalpha() and len(low) == 1: + ol_type = "A" if marker.isupper() else "a" + return ol_type, ord(low) - ord("a") + 1 + if low.isdigit(): + return "1", int(low) + return "1", 1 + + +def _post_process_paren_lists(soup) -> None: + """Walk the soup and convert parenthesized numbering in text into nested
    elements. + + After the markdown parser runs, ``(a) text`` patterns appear as plain text + inside ``
  1. `` or ``

    `` elements. This function finds those patterns + and restructures the HTML so that ``_add_list_items`` sees proper nested + ``

      `` elements with ``type`` and ``data-paren`` attributes. + """ + from bs4 import NavigableString + + # Process
    1. elements that contain inline (a)/(1)/(i) patterns + for li in list(soup.find_all("li")): + # Get the raw text content of this li (may span multiple NavigableStrings) + full_text = li.get_text() + if not _PAREN_ITEM_RE.search(full_text): + continue + + # Split text into the leading part (before the first marker) and the list items + lines = full_text.split("\n") + leading_lines: list[str] = [] + list_items: list[ + tuple[str, int, str, int] + ] = [] # (type, start_val, text, indent_spaces) + + for line in lines: + stripped = line.strip() + if not stripped: + continue + m = _PAREN_ITEM_RE.match(stripped) + if m: + ol_type, start_val = _detect_paren_type(m.group(1)) + item_text = stripped[m.end() :] + indent_spaces = len(line) - len(line.lstrip()) + list_items.append((ol_type, start_val, item_text, indent_spaces)) + else: + if not list_items: + leading_lines.append(stripped) + else: + # Continuation text for the last list item + last = list_items[-1] + list_items[-1] = ( + last[0], + last[1], + last[2] + " " + stripped, + last[3], + ) + + if not list_items: + continue + + # Rebuild the
    2. contents + li.clear() + if leading_lines: + li.append(NavigableString(" ".join(leading_lines))) + + # Group consecutive items by type and build
        elements + current_type = None + current_ol = None + for ol_type, start_val, item_text, indent_spaces in list_items: + if ol_type != current_type: + current_type = ol_type + indent_level = indent_spaces // 4 + current_ol = soup.new_tag( + "ol", + attrs={ + "type": ol_type, + "data-paren": "true", + "data-indent-level": str(indent_level), + }, + ) + if start_val != 1: + current_ol["start"] = str(start_val) + li.append(current_ol) + new_li = soup.new_tag("li") + new_li.string = item_text + current_ol.append(new_li) + + # Also handle standalone

        elements with (a)/(1)/(i) patterns (not inside a list) + for p in list(soup.find_all("p", recursive=False)): + full_text = p.get_text() + if not _PAREN_ITEM_RE.search(full_text): + continue + + lines = full_text.split("\n") + list_items: list[tuple[str, int, str, int]] = [] + for line in lines: + stripped = line.strip() + if not stripped: + continue + m = _PAREN_ITEM_RE.match(stripped) + if m: + ol_type, start_val = _detect_paren_type(m.group(1)) + indent_spaces = len(line) - len(line.lstrip()) + list_items.append( + (ol_type, start_val, stripped[m.end() :], indent_spaces) + ) + if not list_items: + continue + + current_type = None + current_ol = None + for ol_type, start_val, item_text, indent_spaces in list_items: + if ol_type != current_type: + current_type = ol_type + indent_level = indent_spaces // 4 + current_ol = soup.new_tag( + "ol", + attrs={ + "type": ol_type, + "data-paren": "true", + "data-indent-level": str(indent_level), + }, + ) + if start_val != 1: + current_ol["start"] = str(start_val) + p.insert_before(current_ol) + new_li = soup.new_tag("li") + new_li.string = item_text + current_ol.append(new_li) + p.decompose() + + +# --------------------------------------------------------------------------- +# Low-level OOXML numbering helpers +# --------------------------------------------------------------------------- + + +def _numbering_root(doc): + """Return the root element, creating the numbering part if needed.""" + try: + return doc.part.numbering_part._element + except Exception: + # No numbering part yet – force creation by adding and removing a list paragraph + dummy = doc.add_paragraph("", style="List Number") + dummy._element.getparent().remove(dummy._element) + return doc.part.numbering_part._element + + +def _next_abstract_num_id(numbering) -> int: + ids = [ + int(el.get(qn("w:abstractNumId"))) + for el in numbering.findall(qn("w:abstractNum")) + ] + return max(ids, default=-1) + 1 + + +def _next_num_id(numbering) -> int: + ids = [int(el.get(qn("w:numId"))) for el in numbering.findall(qn("w:num"))] + return max(ids, default=0) + 1 + + +_LIST_HANGING_INDENT = 504 +"""Hanging indent in twips used for every numbered-list level. + +A single value is used for **all** numbering formats so that item text aligns +at the same column across lists that use different label styles (e.g. ``1.``, +``(a)``, ``(iii)``). 504 twips (≈ 0.35 in) is wide enough for the widest +common parenthesized roman label ``(viii)`` while keeping the indent compact. +""" + + +def _get_or_create_abstract_num( + doc, num_fmt: str, lvl_text: str, nesting_levels: int = 3, start: int = 1 +) -> int: + """Create an abstract numbering definition for the given format. + + Always creates a new definition so that each independent list gets its own + ``abstractNumId``. Sharing an abstract num across multiple ```` + elements can cause Word to silently drop numbering on some lists. + + Creates a multilevel abstract numbering so nested lists at different ilvl + values share a single definition with increasing indentation. + + *start* sets the ```` value for the first level (ilvl 0). + This ensures renderers that ignore ``/`` + still produce the correct numbering. + """ + numbering = _numbering_root(doc) + abstract_num_id = _next_abstract_num_id(numbering) + + abstract_num = OxmlElement("w:abstractNum") + abstract_num.set(qn("w:abstractNumId"), str(abstract_num_id)) + + multi_level_type = OxmlElement("w:multiLevelType") + multi_level_type.set(qn("w:val"), "multilevel") + abstract_num.append(multi_level_type) + + hanging = _LIST_HANGING_INDENT + + for ilvl in range(nesting_levels): + lvl = OxmlElement("w:lvl") + lvl.set(qn("w:ilvl"), str(ilvl)) + + start_el = OxmlElement("w:start") + start_el.set(qn("w:val"), str(start if ilvl == 0 else 1)) + lvl.append(start_el) + + fmt_el = OxmlElement("w:numFmt") + fmt_el.set(qn("w:val"), num_fmt) + lvl.append(fmt_el) + + # Use the ilvl+1 placeholder for each level (e.g. %1, %2, %3) + actual_lvl_text = lvl_text.replace("%1", f"%{ilvl + 1}") + text_el = OxmlElement("w:lvlText") + text_el.set(qn("w:val"), actual_lvl_text) + lvl.append(text_el) + + jc = OxmlElement("w:lvlJc") + jc.set(qn("w:val"), "left") + lvl.append(jc) + + # Force a tab character after the label so text aligns at the + # left-indent position regardless of label width. + suff = OxmlElement("w:suff") + suff.set(qn("w:val"), "tab") + lvl.append(suff) + + left = hanging + (hanging * ilvl) + ppr = OxmlElement("w:pPr") + ind = OxmlElement("w:ind") + ind.set(qn("w:left"), str(left)) + ind.set(qn("w:hanging"), str(hanging)) + ppr.append(ind) + + # Explicit tab stop at the text position so the tab after the + # label lands exactly at the left indent. + tabs = OxmlElement("w:tabs") + tab = OxmlElement("w:tab") + tab.set(qn("w:val"), "num") + tab.set(qn("w:pos"), str(left)) + tabs.append(tab) + ppr.append(tabs) + + lvl.append(ppr) + + abstract_num.append(lvl) + + # OOXML requires all elements before any . + # Insert before the first so Word doesn't silently ignore it. + first_num = numbering.find(qn("w:num")) + if first_num is not None: + first_num.addprevious(abstract_num) + else: + numbering.append(abstract_num) + return abstract_num_id + + +def _create_num( + doc, abstract_num_id: int, start_override: int | None = None, level: int = 0 +) -> int: + """Create a new referencing the given abstract numbering. + + If *start_override* is provided, a ```` element is added so + that numbering starts at the given value rather than continuing. + """ + numbering = _numbering_root(doc) + num_id = _next_num_id(numbering) + + num = OxmlElement("w:num") + num.set(qn("w:numId"), str(num_id)) + + abstract_ref = OxmlElement("w:abstractNumId") + abstract_ref.set(qn("w:val"), str(abstract_num_id)) + num.append(abstract_ref) + + if start_override is not None: + lvl_override = OxmlElement("w:lvlOverride") + lvl_override.set(qn("w:ilvl"), str(level)) + + start_el = OxmlElement("w:startOverride") + start_el.set(qn("w:val"), str(start_override)) + lvl_override.append(start_el) + + num.append(lvl_override) + + numbering.append(num) + return num_id + + +def _apply_numbering(paragraph, num_id: int, level: int = 0) -> None: + """Apply numbering properties to a paragraph at the given nesting level.""" + p_pr = paragraph._p.get_or_add_pPr() + + num_pr = p_pr.find(qn("w:numPr")) + if num_pr is None: + num_pr = OxmlElement("w:numPr") + p_pr.append(num_pr) + + ilvl = num_pr.find(qn("w:ilvl")) + if ilvl is None: + ilvl = OxmlElement("w:ilvl") + num_pr.append(ilvl) + ilvl.set(qn("w:val"), str(level)) + + num_id_el = num_pr.find(qn("w:numId")) + if num_id_el is None: + num_id_el = OxmlElement("w:numId") + num_pr.append(num_id_el) + num_id_el.set(qn("w:val"), str(num_id)) + + +def _patch_abstract_num_level( + doc, num_id: int, level: int, num_fmt: str, lvl_text: str +) -> None: + """Patch the abstractNum referenced by *num_id* so that *level* uses the given format. + + When a child list (e.g. ``(a)``) is nested under a parent list (e.g. ``1.``), + both must share the same ``numId``. This function updates the parent's + abstract numbering definition so that the child's ``ilvl`` has the correct + ``numFmt`` and ``lvlText``. + """ + numbering = _numbering_root(doc) + + # Find the for this numId and get its abstractNumId + abstract_num_id = None + for num_el in numbering.findall(qn("w:num")): + if int(num_el.get(qn("w:numId"))) == num_id: + abstract_num_id = int(num_el.find(qn("w:abstractNumId")).get(qn("w:val"))) + break + if abstract_num_id is None: + return + + # Find the abstractNum + abstract_num = None + for an in numbering.findall(qn("w:abstractNum")): + if int(an.get(qn("w:abstractNumId"))) == abstract_num_id: + abstract_num = an + break + if abstract_num is None: + return + + # Find or create the for this ilvl + target_lvl = None + for lvl in abstract_num.findall(qn("w:lvl")): + if int(lvl.get(qn("w:ilvl"))) == level: + target_lvl = lvl + break + + if target_lvl is None: + # Create a new level + target_lvl = OxmlElement("w:lvl") + target_lvl.set(qn("w:ilvl"), str(level)) + start_el = OxmlElement("w:start") + start_el.set(qn("w:val"), "1") + target_lvl.append(start_el) + abstract_num.append(target_lvl) + + # Update numFmt + fmt_el = target_lvl.find(qn("w:numFmt")) + if fmt_el is None: + fmt_el = OxmlElement("w:numFmt") + target_lvl.append(fmt_el) + fmt_el.set(qn("w:val"), num_fmt) + + # Update lvlText + actual_lvl_text = lvl_text.replace("%1", f"%{level + 1}") + txt_el = target_lvl.find(qn("w:lvlText")) + if txt_el is None: + txt_el = OxmlElement("w:lvlText") + target_lvl.append(txt_el) + txt_el.set(qn("w:val"), actual_lvl_text) + + # Ensure lvlJc exists + jc = target_lvl.find(qn("w:lvlJc")) + if jc is None: + jc = OxmlElement("w:lvlJc") + jc.set(qn("w:val"), "left") + target_lvl.append(jc) + + # Ensure suffix is tab-based for consistent text alignment + suff = target_lvl.find(qn("w:suff")) + if suff is None: + suff = OxmlElement("w:suff") + target_lvl.append(suff) + suff.set(qn("w:val"), "tab") + + # Ensure indentation + hanging = _LIST_HANGING_INDENT + left = hanging + (hanging * level) + ppr = target_lvl.find(qn("w:pPr")) + if ppr is None: + ppr = OxmlElement("w:pPr") + target_lvl.append(ppr) + ind = ppr.find(qn("w:ind")) + if ind is None: + ind = OxmlElement("w:ind") + ppr.append(ind) + ind.set(qn("w:left"), str(left)) + ind.set(qn("w:hanging"), str(hanging)) + + # Explicit tab stop at the text position + tabs = ppr.find(qn("w:tabs")) + if tabs is None: + tabs = OxmlElement("w:tabs") + ppr.append(tabs) + tab = OxmlElement("w:tab") + tab.set(qn("w:val"), "num") + tab.set(qn("w:pos"), str(left)) + tabs.append(tab) + + +def _ol_type_to_numfmt(type_attr: str | None, paren: bool = False) -> tuple[str, str]: + """Map HTML

          to (OOXML numFmt, lvlText). + + When *paren* is True the level text uses parenthesized form ``(%1)`` + for all types. Otherwise decimal uses ``%1.`` (standard ``1. 2. 3.``). + """ + type_attr = type_attr or "1" + fmt_map = { + "1": "decimal", + "a": "lowerLetter", + "A": "upperLetter", + "i": "lowerRoman", + } + num_fmt = fmt_map.get(type_attr, "decimal") + if paren or type_attr.lower() in ("a", "i"): + lvl_text = "(%1)" + else: + lvl_text = "%1." + return num_fmt, lvl_text + + def parse_markdown_to_docx(doc: Document, markdown_text: str) -> None: """Parse markdown text and add elements to Word document""" # Convert markdown to HTML - html = markdown.markdown(markdown_text, extensions=["tables", "fenced_code"]) + html = markdown.markdown( + markdown_text, extensions=["tables", "fenced_code", "sane_lists"] + ) soup = BeautifulSoup(html, "html.parser") + # Post-process: convert (a), (1), (i) text patterns into nested
            elements + _post_process_paren_lists(soup) + + # Track ordered list numbering state for restart/continue semantics + list_state: dict[str, Any] = {"ordered": {}} + # Process each HTML element in order for element in soup.find_all( [ @@ -575,10 +1077,13 @@ def parse_markdown_to_docx(doc: Document, markdown_text: str) -> None: "blockquote", "table", "pre", - ] + ], + recursive=False, ): if element.name.startswith("h"): - # Handle headings + # Handle headings – reset list continuation state so lists + # after a heading start fresh + list_state["ordered"] = {} level = int(element.name[1]) # Extract number from h1, h2, etc. text = element.get_text().strip() if text: @@ -590,15 +1095,8 @@ def parse_markdown_to_docx(doc: Document, markdown_text: str) -> None: _add_formatted_text_to_paragraph(paragraph, element) elif element.name in ["ul", "ol"]: - # Handle lists - is_numbered = element.name == "ol" - for li in element.find_all("li", recursive=False): - text = li.get_text().strip() - if text: - if is_numbered: - doc.add_paragraph(text, style="List Number") - else: - doc.add_paragraph(text, style="List Bullet") + # Handle lists (including nested) + _add_list_items(doc, element, level=0, list_state=list_state) elif element.name == "blockquote": # Handle blockquotes @@ -620,10 +1118,125 @@ def parse_markdown_to_docx(doc: Document, markdown_text: str) -> None: run.font.name = "Courier New" -def _add_formatted_text_to_paragraph(paragraph, html_element): +def _add_list_items( + doc: Document, + list_element, + level: int, + list_state: dict, + parent_num_id: int | None = None, +) -> None: + """Recursively add list items to Word document with proper nesting. + + For bullet lists, uses Word's built-in 'List Bullet' styles. + For ordered lists, creates low-level OOXML numbering definitions that + support custom formats (decimal, lowerLetter, lowerRoman) and proper + restart/continuation semantics. + + *parent_num_id* is passed when a child ordered list should share the + parent's numbering instance so that Word renders all nesting levels + under one coherent list. + """ + is_numbered = list_element.name == "ol" + + # Respect original markdown indentation via data-indent-level attribute. + # Every 4 leading spaces in the markdown source maps to one indent level. + # When data-indent-level is set it already encodes the absolute nesting + # depth relative to the top-level list, so we must NOT add `level` (which + # the recursive call already incremented) on top of it — that would + # double-count the nesting. + indent_level = int(list_element.get("data-indent-level", 0)) + effective_level = indent_level if indent_level > 0 else level + + num_id = None + if is_numbered: + start = int(list_element.get("start", 1)) + type_attr = list_element.get("type") or "1" + + paren = list_element.get("data-paren") == "true" + num_fmt, lvl_text = _ol_type_to_numfmt(type_attr, paren=paren) + + if parent_num_id is not None and level > 0: + # Child list: reuse parent numId but patch the abstractNum to + # have the correct format at this ilvl. + num_id = parent_num_id + _patch_abstract_num_level(doc, num_id, effective_level, num_fmt, lvl_text) + else: + abstract_num_id = _get_or_create_abstract_num( + doc, num_fmt, lvl_text, start=start + ) + + # Key for tracking continuation: lists at the same nesting level + # with the same format can continue numbering across boundaries + key = (effective_level, num_fmt, lvl_text) + + if start == 1: + num_id = _create_num( + doc, abstract_num_id, start_override=1, level=effective_level + ) + else: + num_id = list_state["ordered"].get(key) + if num_id is None: + num_id = _create_num( + doc, + abstract_num_id, + start_override=start, + level=effective_level, + ) + + list_state["ordered"][key] = num_id + + else: + clamped_level = min(effective_level, 2) + bullet_style = ( + "List Bullet" if clamped_level == 0 else f"List Bullet {clamped_level + 1}" + ) + + for li in list_element.find_all("li", recursive=False): + # Collect direct text of this
          1. , ignoring nested
              /
                + text_parts = [] + for child in li.children: + if hasattr(child, "name") and child.name in ("ul", "ol"): + continue + text_parts.append( + child.get_text() if hasattr(child, "get_text") else str(child) + ) + text = "".join(text_parts).strip() + + if text: + if is_numbered: + p = doc.add_paragraph() + p.style = doc.styles["List Paragraph"] + _apply_numbering(p, num_id=num_id, level=effective_level) + _add_formatted_text_to_paragraph(p, li, skip_nested_lists=True) + else: + doc.add_paragraph(text, style=bullet_style) + + # Recurse into nested
                  or
                    (direct children of this
                  1. ) + # Nested ordered lists inherit the parent numId so Word keeps them + # under one coherent multilevel numbering instance. + effective_parent = num_id if is_numbered else parent_num_id + for nested_list in li.find_all(["ul", "ol"], recursive=False): + _add_list_items( + doc, + nested_list, + level + 1, + list_state=list_state, + parent_num_id=effective_parent, + ) + + +def _add_formatted_text_to_paragraph( + paragraph, html_element, skip_nested_lists: bool = False +): """Add formatted text from HTML element to Word paragraph""" # Handle direct text and formatting for content in html_element.contents: + if ( + skip_nested_lists + and hasattr(content, "name") + and content.name in ("ul", "ol") + ): + continue if hasattr(content, "name") and content.name: # This is an HTML tag if content.name == "strong" or content.name == "b": @@ -641,7 +1254,9 @@ def _add_formatted_text_to_paragraph(paragraph, html_element): else: # Nested elements - recursively process only if it has contents if hasattr(content, "contents"): - _add_formatted_text_to_paragraph(paragraph, content) + _add_formatted_text_to_paragraph( + paragraph, content, skip_nested_lists=skip_nested_lists + ) else: # Just add the text content text = content.get_text() @@ -749,15 +1364,21 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): data={ "template_summary": { "structure": f"{analysis['paragraphs']}p,{analysis['tables']}t", - "fillable_total": int(analysis["fillable_paragraphs"] + analysis["fillable_cells"]), + "fillable_total": int( + analysis["fillable_paragraphs"] + analysis["fillable_cells"] + ), "content_elements_hidden": int( - analysis["total_elements"] - len(fillable_paragraphs) - len(fillable_cells) + analysis["total_elements"] + - len(fillable_paragraphs) + - len(fillable_cells) ), }, "fillable_paragraphs": fillable_paragraphs, "fillable_cells": fillable_cells, "pattern_distribution": pattern_counts, - "recommended_strategy": "mixed" if len(pattern_counts) > 2 else "single_method", + "recommended_strategy": "mixed" + if len(pattern_counts) > 2 + else "single_method", "template_ready": True, }, cost_usd=0.0, @@ -803,7 +1424,9 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): "markdown_processed": bool(markdown_content), } - return await save_and_return_document(result, document_id, context, custom_filename) + return await save_and_return_document( + result, document_id, context, custom_filename + ) @doc_maker.action("add_table") @@ -861,7 +1484,8 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): # Check if it's an image by extension or content type is_image_by_extension = any( - filename.lower().endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"] + filename.lower().endswith(ext) + for ext in [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"] ) is_image_by_content_type = content_type.startswith("image/") @@ -876,7 +1500,9 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): paragraph = doc.add_paragraph() if width and height: - paragraph.add_run().add_picture(image_file, width=Inches(width), height=Inches(height)) + paragraph.add_run().add_picture( + image_file, width=Inches(width), height=Inches(height) + ) elif width: paragraph.add_run().add_picture(image_file, width=Inches(width)) elif height: @@ -942,7 +1568,11 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): new_content = update["content"] # Get paragraph by index using iter_block_items - paragraphs = [block for block in iter_block_items(doc) if isinstance(block, Paragraph)] + paragraphs = [ + block + for block in iter_block_items(doc) + if isinstance(block, Paragraph) + ] if paragraph_index < len(paragraphs): paragraph = paragraphs[paragraph_index] @@ -961,22 +1591,32 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): new_content = update["content"] # Get table by index - tables = [block for block in iter_block_items(doc) if isinstance(block, Table)] + tables = [ + block for block in iter_block_items(doc) if isinstance(block, Table) + ] if table_index < len(tables): table = tables[table_index] if row < len(table.rows) and col < len(table.columns): cell = table.cell(row, col) cell.text = new_content - changes_made.append(f"Updated table {table_index} cell ({row},{col})") + changes_made.append( + f"Updated table {table_index} cell ({row},{col})" + ) else: - changes_made.append(f"Cell ({row},{col}) out of range in table {table_index}") + changes_made.append( + f"Cell ({row},{col}) out of range in table {table_index}" + ) else: changes_made.append(f"Table {table_index} not found") # Create LLM-optimized response successful_updates = [change for change in changes_made if "Updated" in change] - failed_updates = [change for change in changes_made if "not found" in change or "out of range" in change] + failed_updates = [ + change + for change in changes_made + if "not found" in change or "out of range" in change + ] original_result = { "success": len(successful_updates) > 0, @@ -984,7 +1624,9 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): "failed": len(failed_updates), "summary": f"Updated {len(successful_updates)} elements" + (f", {len(failed_updates)} failed" if failed_updates else ""), - "failures": failed_updates[:3] if failed_updates else [], # Limit failure details + "failures": failed_updates[:3] + if failed_updates + else [], # Limit failure details } return await save_and_return_document(original_result, document_id, context) @@ -1000,7 +1642,9 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): try: replacements = json.loads(replacements) except json.JSONDecodeError: - return ActionError(message="Invalid replacements format: must be array or valid JSON string") + return ActionError( + message="Invalid replacements format: must be array or valid JSON string" + ) case_sensitive = inputs.get("case_sensitive", False) files = inputs.get("files", []) @@ -1020,7 +1664,9 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): for replacement in replacements: find_text = replacement["find"] - replace_text = replacement.get("replace", "") # Default to empty string if not provided + replace_text = replacement.get( + "replace", "" + ) # Default to empty string if not provided replace_all = replacement.get("replace_all", False) remove_paragraph = replacement.get("remove_paragraph", False) @@ -1151,7 +1797,11 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): original_text = paragraph.text is_full_paragraph_match = original_text.strip() == find_text.strip() - if is_full_paragraph_match and replace_text.strip() == "" and remove_paragraph: + if ( + is_full_paragraph_match + and replace_text.strip() == "" + and remove_paragraph + ): # Mark paragraph for removal to eliminate spacing paragraphs_to_remove.append(paragraph) replacements_count += 1 @@ -1314,12 +1964,18 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): replacement_count += 1 if replacement_count > 0: - changes_made.append(f"Replaced '{placeholder}' {replacement_count} times") + changes_made.append( + f"Replaced '{placeholder}' {replacement_count} times" + ) # 2. Position-based updates if "position_data" in template_data: - paragraphs = [block for block in iter_block_items(doc) if isinstance(block, Paragraph)] - tables = [block for block in iter_block_items(doc) if isinstance(block, Table)] + paragraphs = [ + block for block in iter_block_items(doc) if isinstance(block, Paragraph) + ] + tables = [ + block for block in iter_block_items(doc) if isinstance(block, Table) + ] for position_key, new_content in template_data["position_data"].items(): if position_key.startswith("paragraph_"): @@ -1327,7 +1983,9 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): if idx < len(paragraphs): # Use centralized parser for all content if has_markdown_formatting(str(new_content)): - parse_and_apply_markdown_formatting(paragraphs[idx], str(new_content)) + parse_and_apply_markdown_formatting( + paragraphs[idx], str(new_content) + ) else: paragraphs[idx].text = str(new_content) changes_made.append(f"Updated paragraph {idx}") @@ -1345,10 +2003,14 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): cell = table.cell(row_idx, col_idx) # Use centralized parser for all content if has_markdown_formatting(str(new_content)): - parse_and_apply_markdown_formatting(cell, str(new_content)) + parse_and_apply_markdown_formatting( + cell, str(new_content) + ) else: cell.text = str(new_content) - changes_made.append(f"Updated table {table_idx} cell ({row_idx},{col_idx})") + changes_made.append( + f"Updated table {table_idx} cell ({row_idx},{col_idx})" + ) # 3. Search and replace patterns (with safety analysis) safety_warnings = [] @@ -1392,7 +2054,9 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): # Analyze safety if multiple matches if len(matches_found) > 1 and not replace_all: - safety_analysis = analyze_replacement_safety(find_text, matches_found) + safety_analysis = analyze_replacement_safety( + find_text, matches_found + ) if safety_analysis["safety_level"] == "high_risk": # Block high-risk replacements @@ -1431,9 +2095,15 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): for paragraph in doc.paragraphs: if find_text.lower() in paragraph.text.lower(): original_text = paragraph.text - is_full_match = original_text.strip().lower() == find_text.lower() + is_full_match = ( + original_text.strip().lower() == find_text.lower() + ) - if is_full_match and replace_text.strip() == "" and remove_paragraph: + if ( + is_full_match + and replace_text.strip() == "" + and remove_paragraph + ): paragraphs_to_remove.append(paragraph) replacement_count += 1 else: @@ -1485,30 +2155,42 @@ async def execute(self, inputs: Dict[str, Any], context: ExecutionContext): replacement_count += 1 if replacement_count > 0: - changes_made.append(f"Found and replaced '{find_text}' {replacement_count} times") + changes_made.append( + f"Found and replaced '{find_text}' {replacement_count} times" + ) # Create LLM-optimized response with prominent safety warnings - has_critical_warnings = any("CRITICAL_WARNING" in str(warning) for warning in safety_warnings) + has_critical_warnings = any( + "CRITICAL_WARNING" in str(warning) for warning in safety_warnings + ) blocked_operations = len([w for w in safety_warnings if "BLOCKED" in str(w)]) change_summary = {} for change in changes_made: if "Replaced" in change: - change_summary["placeholders"] = change_summary.get("placeholders", 0) + 1 + change_summary["placeholders"] = ( + change_summary.get("placeholders", 0) + 1 + ) elif "Found and replaced" in change: change_summary["searches"] = change_summary.get("searches", 0) + 1 elif "Updated" in change: change_summary["positions"] = change_summary.get("positions", 0) + 1 original_result = { - "SAFETY_STATUS": "CRITICAL_ISSUES_DETECTED" if has_critical_warnings else "OK", + "SAFETY_STATUS": "CRITICAL_ISSUES_DETECTED" + if has_critical_warnings + else "OK", "success": len(changes_made) > 0 and not has_critical_warnings, "completed_operations": len(changes_made), "blocked_operations": blocked_operations, "safety_warnings": safety_warnings, "filled_summary": change_summary, - "template_status": "partially_complete" if blocked_operations > 0 else "complete", - "action_required": "Review safety warnings and use more specific context" if safety_warnings else "none", + "template_status": "partially_complete" + if blocked_operations > 0 + else "complete", + "action_required": "Review safety warnings and use more specific context" + if safety_warnings + else "none", } return await save_and_return_document(original_result, document_id, context) diff --git a/doc-maker/tests/context.py b/doc-maker/tests/context.py index 6058d41e..d048f14e 100644 --- a/doc-maker/tests/context.py +++ b/doc-maker/tests/context.py @@ -4,7 +4,9 @@ # Add paths for imports FIRST sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../dependencies"))) +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../dependencies")) +) # Now we can import the doc-maker module try: @@ -16,7 +18,9 @@ integration_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) os.chdir(integration_dir) - spec = importlib.util.spec_from_file_location("doc_maker", os.path.join(integration_dir, "doc_maker.py")) + spec = importlib.util.spec_from_file_location( + "doc_maker", os.path.join(integration_dir, "doc_maker.py") + ) doc_maker_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(doc_maker_module) @@ -25,7 +29,9 @@ # Export both the integration instance and the module (for unit tests) doc_maker = doc_maker_module.doc_maker - doc_maker_functions = doc_maker_module # For accessing utility functions in unit tests + doc_maker_functions = ( + doc_maker_module # For accessing utility functions in unit tests + ) except ImportError as e: print(f"Import error: {e}") print("Available sys.path entries:") diff --git a/doc-maker/tests/test_doc_maker_integration.py b/doc-maker/tests/test_doc_maker_integration.py index b0fbca1e..63ccc8e4 100644 --- a/doc-maker/tests/test_doc_maker_integration.py +++ b/doc-maker/tests/test_doc_maker_integration.py @@ -26,7 +26,9 @@ _original_cwd = os.getcwd() os.chdir(_parent) -_spec = importlib.util.spec_from_file_location("doc_maker_mod_intg", os.path.join(_parent, "doc_maker.py")) +_spec = importlib.util.spec_from_file_location( + "doc_maker_mod_intg", os.path.join(_parent, "doc_maker.py") +) _mod = importlib.util.module_from_spec(_spec) _spec.loader.exec_module(_mod) os.chdir(_original_cwd) @@ -77,7 +79,9 @@ async def test_create_document_returns_valid_docx_binary(self, live_context): assert_valid_docx(file_obj) async def test_create_document_with_title(self, live_context): - result = await doc_maker.execute_action("create_document", {"title": "My Integration Test Doc"}, live_context) + result = await doc_maker.execute_action( + "create_document", {"title": "My Integration Test Doc"}, live_context + ) assert result.type == ResultType.ACTION data = result.result.data @@ -85,8 +89,12 @@ async def test_create_document_with_title(self, live_context): assert data["paragraph_count"] >= 1 async def test_create_document_with_markdown_content(self, live_context): - markdown = "# Test Heading\n\nThis is a **bold** paragraph.\n\n- Item one\n- Item two" - result = await doc_maker.execute_action("create_document", {"markdown_content": markdown}, live_context) + markdown = ( + "# Test Heading\n\nThis is a **bold** paragraph.\n\n- Item one\n- Item two" + ) + result = await doc_maker.execute_action( + "create_document", {"markdown_content": markdown}, live_context + ) assert result.type == ResultType.ACTION data = result.result.data @@ -96,7 +104,9 @@ async def test_create_document_with_markdown_content(self, live_context): class TestAddTable: async def test_add_table_to_document(self, live_context): - create_result = await doc_maker.execute_action("create_document", {}, live_context) + create_result = await doc_maker.execute_action( + "create_document", {}, live_context + ) document_id = create_result.result.data["document_id"] file_obj = create_result.result.data["file"] @@ -124,7 +134,9 @@ class TestSaveDocument: async def test_save_document_returns_valid_docx(self, live_context): create_result = await doc_maker.execute_action( "create_document", - {"markdown_content": "# Integration Test\n\nGenerated by doc-maker integration tests."}, + { + "markdown_content": "# Integration Test\n\nGenerated by doc-maker integration tests." + }, live_context, ) document_id = create_result.result.data["document_id"] @@ -161,7 +173,9 @@ async def test_save_missing_document_returns_error(self, live_context): class TestAddMarkdownContent: async def test_add_markdown_to_existing_document(self, live_context): - create_result = await doc_maker.execute_action("create_document", {}, live_context) + create_result = await doc_maker.execute_action( + "create_document", {}, live_context + ) document_id = create_result.result.data["document_id"] file_obj = create_result.result.data["file"] diff --git a/doc-maker/tests/test_doc_maker_unit.py b/doc-maker/tests/test_doc_maker_unit.py index f7dd0187..cd420aea 100644 --- a/doc-maker/tests/test_doc_maker_unit.py +++ b/doc-maker/tests/test_doc_maker_unit.py @@ -17,7 +17,9 @@ # Load the module from its file location (Integration.load() needs the cwd set) _original_cwd = os.getcwd() os.chdir(_parent) -_spec = importlib.util.spec_from_file_location("doc_maker_mod", os.path.join(_parent, "doc_maker.py")) +_spec = importlib.util.spec_from_file_location( + "doc_maker_mod", os.path.join(_parent, "doc_maker.py") +) _mod = importlib.util.module_from_spec(_spec) _spec.loader.exec_module(_mod) os.chdir(_original_cwd) @@ -28,6 +30,7 @@ is_likely_placeholder_context = _mod.is_likely_placeholder_context analyze_replacement_safety = _mod.analyze_replacement_safety _save_document_to_dict = _mod._save_document_to_dict +parse_markdown_to_docx = _mod.parse_markdown_to_docx documents = _mod.documents pytestmark = pytest.mark.unit @@ -51,7 +54,9 @@ def _make_docx_bytes() -> bytes: return buf.read() -def _make_file_item(name: str, data: bytes, content_type: str = "application/octet-stream") -> dict: +def _make_file_item( + name: str, data: bytes, content_type: str = "application/octet-stream" +) -> dict: return { "name": name, "contentType": content_type, @@ -101,12 +106,16 @@ def test_whitespace_only(self): assert is_ph is True def test_real_content_not_placeholder(self): - is_ph, pattern = detect_placeholder_patterns("This is a complete sentence with actual content.") + is_ph, pattern = detect_placeholder_patterns( + "This is a complete sentence with actual content." + ) assert is_ph is False assert pattern == "content" def test_business_content_not_placeholder(self): - is_ph, _ = detect_placeholder_patterns("The quarterly revenue exceeded expectations.") + is_ph, _ = detect_placeholder_patterns( + "The quarterly revenue exceeded expectations." + ) assert is_ph is False @@ -147,10 +156,20 @@ def test_instruction_phrase(self): assert is_likely_placeholder_context("insert data here", "data") is True def test_content_sentence_not_placeholder(self): - assert is_likely_placeholder_context("The project name should be descriptive.", "name") is False + assert ( + is_likely_placeholder_context( + "The project name should be descriptive.", "name" + ) + is False + ) def test_complete_sentence_not_placeholder(self): - assert is_likely_placeholder_context("The date for the meeting has been set.", "date") is False + assert ( + is_likely_placeholder_context( + "The date for the meeting has been set.", "date" + ) + is False + ) class TestAnalyzeReplacementSafety: @@ -261,7 +280,9 @@ async def test_save_missing_document_returns_action_error(self, mock_context): @pytest.mark.asyncio async def test_save_existing_document_succeeds(self, mock_context): # Create doc first - create_result = await doc_maker.execute_action("create_document", {}, mock_context) + create_result = await doc_maker.execute_action( + "create_document", {}, mock_context + ) doc_id = create_result.result.data["document_id"] result = await doc_maker.execute_action( @@ -322,7 +343,9 @@ async def test_get_elements_response_shape(self, mock_context): class TestAddTable: @pytest.mark.asyncio async def test_add_table_to_document(self, mock_context): - create_result = await doc_maker.execute_action("create_document", {}, mock_context) + create_result = await doc_maker.execute_action( + "create_document", {}, mock_context + ) doc_id = create_result.result.data["document_id"] docx_bytes = base64.b64decode(create_result.result.data["file"]["content"]) file_item = _make_file_item(f"{doc_id}.docx", docx_bytes) @@ -360,7 +383,9 @@ async def test_add_table_missing_document_raises(self, mock_context): class TestAddMarkdownContent: @pytest.mark.asyncio async def test_add_markdown_to_existing_document(self, mock_context): - create_result = await doc_maker.execute_action("create_document", {}, mock_context) + create_result = await doc_maker.execute_action( + "create_document", {}, mock_context + ) doc_id = create_result.result.data["document_id"] docx_bytes = base64.b64decode(create_result.result.data["file"]["content"]) file_item = _make_file_item(f"{doc_id}.docx", docx_bytes) @@ -391,7 +416,9 @@ async def test_add_markdown_missing_document_raises(self, mock_context): class TestAddPageBreak: @pytest.mark.asyncio async def test_add_page_break(self, mock_context): - create_result = await doc_maker.execute_action("create_document", {}, mock_context) + create_result = await doc_maker.execute_action( + "create_document", {}, mock_context + ) doc_id = create_result.result.data["document_id"] docx_bytes = base64.b64decode(create_result.result.data["file"]["content"]) file_item = _make_file_item(f"{doc_id}.docx", docx_bytes) @@ -431,7 +458,9 @@ async def test_update_paragraph_by_position(self, mock_context): "update_by_position", { "document_id": doc_id, - "updates": [{"type": "paragraph", "index": 0, "content": "Updated content"}], + "updates": [ + {"type": "paragraph", "index": 0, "content": "Updated content"} + ], "files": [file_item], }, mock_context, @@ -471,7 +500,9 @@ async def test_find_and_replace_basic(self, mock_context): "find_and_replace", { "document_id": doc_id, - "replacements": [{"find": "{{NAME}}", "replace": "Alice", "replace_all": True}], + "replacements": [ + {"find": "{{NAME}}", "replace": "Alice", "replace_all": True} + ], "files": [file_item], }, mock_context, @@ -506,7 +537,9 @@ async def test_find_no_match_returns_warning(self, mock_context): "find_and_replace", { "document_id": doc_id, - "replacements": [{"find": "NONEXISTENT_TEXT_XYZ", "replace": "replacement"}], + "replacements": [ + {"find": "NONEXISTENT_TEXT_XYZ", "replace": "replacement"} + ], "files": [file_item], }, mock_context, @@ -519,7 +552,9 @@ async def test_find_no_match_returns_warning(self, mock_context): async def test_find_and_replace_invalid_type_replacements(self, mock_context): # The SDK validates input schema: replacements must be array, so passing a # non-array non-string type triggers VALIDATION_ERROR before the handler runs. - create_result = await doc_maker.execute_action("create_document", {}, mock_context) + create_result = await doc_maker.execute_action( + "create_document", {}, mock_context + ) doc_id = create_result.result.data["document_id"] docx_bytes = base64.b64decode(create_result.result.data["file"]["content"]) file_item = _make_file_item(f"{doc_id}.docx", docx_bytes) @@ -583,3 +618,1070 @@ def test_missing_document_returns_error_dict(self): assert result["saved"] is False assert "nonexistent-id" in result["error"] assert result["file"]["content"] == "" + + +class TestParenthesizedListNumbering: + """Verify that (1), (a), (i) style lists produce correct Word numbering.""" + + MARKDOWN = ( + "1. Elephant\n" + " (a) Elephants are the largest land animals on Earth, " + "with African elephants weighing up to 14,000 lbs.\n" + " (b) They have an exceptional memory and can recognize " + "themselves in mirrors, indicating self-awareness.\n" + "2. Axolotl\n" + " (a) Axolotls can regenerate entire limbs, including " + "parts of their heart and brain.\n" + " (b) Unlike most amphibians, axolotls retain their larval " + "features throughout their entire lives, a trait called neoteny." + ) + + @staticmethod + def _get_numpr(paragraph): + """Return (numId, ilvl) from a paragraph's w:numPr, or None.""" + from docx.oxml.ns import qn + + pPr = paragraph._p.find(qn("w:pPr")) + if pPr is None: + return None + numPr = pPr.find(qn("w:numPr")) + if numPr is None: + return None + numId_el = numPr.find(qn("w:numId")) + ilvl_el = numPr.find(qn("w:ilvl")) + if numId_el is None or ilvl_el is None: + return None + return int(numId_el.get(qn("w:val"))), int(ilvl_el.get(qn("w:val"))) + + @staticmethod + def _get_abstract_num_for(doc, num_id): + """Return the abstractNum element referenced by a given numId.""" + from docx.oxml.ns import qn + + numbering = doc.part.numbering_part._element + for num_el in numbering.findall(qn("w:num")): + if int(num_el.get(qn("w:numId"))) == num_id: + abstract_ref = num_el.find(qn("w:abstractNumId")) + abstract_id = int(abstract_ref.get(qn("w:val"))) + for an in numbering.findall(qn("w:abstractNum")): + if int(an.get(qn("w:abstractNumId"))) == abstract_id: + return an + return None + + def test_produces_six_numbered_paragraphs(self): + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + assert len(numbered) == 6, ( + f"Expected 6 numbered paragraphs, got {len(numbered)}: {numbered}" + ) + + def test_top_level_items_are_at_ilvl_zero(self): + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + top_items = [ + (text, numpr) + for text, numpr in numbered + if "Elephant" == text or "Axolotl" == text + ] + assert len(top_items) == 2, f"Expected 2 top-level items, got {top_items}" + for text, (num_id, ilvl) in top_items: + assert ilvl == 0, f"'{text}' should be at ilvl 0, got {ilvl}" + + def test_sub_items_are_indented(self): + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + sub_items = [ + (text, numpr) + for text, numpr in numbered + if text not in ("Elephant", "Axolotl") + ] + assert len(sub_items) == 4, f"Expected 4 sub-items, got {len(sub_items)}" + for text, (num_id, ilvl) in sub_items: + assert ilvl >= 1, ( + f"Sub-item should be indented (ilvl >= 1), got {ilvl}: {text}" + ) + + def test_top_level_uses_decimal_numbering(self): + from docx import Document + from docx.oxml.ns import qn + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + elephant = next((text, numpr) for text, numpr in numbered if text == "Elephant") + num_id = elephant[1][0] + abstract = self._get_abstract_num_for(doc, num_id) + assert abstract is not None + lvl0 = abstract.find(qn("w:lvl")) + fmt = lvl0.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == "decimal", f"Top-level should be decimal, got {fmt}" + + def test_sub_items_use_lower_letter_parenthesized(self): + from docx import Document + from docx.oxml.ns import qn + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + first_sub = next( + (text, numpr) for text, numpr in numbered if "Elephants are" in text + ) + num_id, ilvl = first_sub[1] + abstract = self._get_abstract_num_for(doc, num_id) + assert abstract is not None + + # Find the lvl element matching the ilvl used + target_lvl = None + for lvl in abstract.findall(qn("w:lvl")): + if int(lvl.get(qn("w:ilvl"))) == ilvl: + target_lvl = lvl + break + assert target_lvl is not None + + fmt = target_lvl.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == "lowerLetter", f"Sub-items should be lowerLetter, got {fmt}" + lvl_text = target_lvl.find(qn("w:lvlText")).get(qn("w:val")) + assert "(" in lvl_text, ( + f"Sub-items should have parenthesized format, got '{lvl_text}'" + ) + + def test_elephant_text_on_same_line_as_number(self): + """The parent item text must appear in the same paragraph as the numbering.""" + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + elephant_paras = [(t, n) for t, n in numbered if "Elephant" in t and n[1] == 0] + assert len(elephant_paras) >= 1 + assert elephant_paras[0][0] == "Elephant", ( + f"Top-level text should be exactly 'Elephant', got '{elephant_paras[0][0]}'" + ) + + def test_parent_and_children_share_same_numid(self): + """Word requires nested lists to share the same numId to render correctly.""" + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + num_ids = set(numpr[0] for _, numpr in numbered) + assert len(num_ids) == 1, ( + f"All items should share one numId for coherent multilevel numbering, got {num_ids}" + ) + + +class TestMultipleParenListsAfterHeadings: + """Verify that multiple (1)-style lists separated by headings all display numbering + and are left-aligned when the markdown has no leading spaces.""" + + MARKDOWN = "# Animals\n(1) Elephant\n(2) Tiger\n# Fish\n(1) squid\n(2) Whale" + + @staticmethod + def _get_numpr(paragraph): + from docx.oxml.ns import qn + + pPr = paragraph._p.find(qn("w:pPr")) + if pPr is None: + return None + numPr = pPr.find(qn("w:numPr")) + if numPr is None: + return None + numId_el = numPr.find(qn("w:numId")) + ilvl_el = numPr.find(qn("w:ilvl")) + if numId_el is None or ilvl_el is None: + return None + return int(numId_el.get(qn("w:val"))), int(ilvl_el.get(qn("w:val"))) + + @staticmethod + def _get_abstract_num_for(doc, num_id): + from docx.oxml.ns import qn + + numbering = doc.part.numbering_part._element + for num_el in numbering.findall(qn("w:num")): + if int(num_el.get(qn("w:numId"))) == num_id: + abstract_ref = num_el.find(qn("w:abstractNumId")) + abstract_id = int(abstract_ref.get(qn("w:val"))) + for an in numbering.findall(qn("w:abstractNum")): + if int(an.get(qn("w:abstractNumId"))) == abstract_id: + return an + return None + + def test_both_lists_produce_numbered_paragraphs(self): + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + assert len(numbered) == 4, ( + f"Expected 4 numbered paragraphs, got {len(numbered)}: {numbered}" + ) + + def test_each_list_has_its_own_abstract_num(self): + """Each independent list must get its own abstractNum to avoid Word dropping numbers.""" + from docx import Document + from docx.oxml.ns import qn + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + animals_num_id = numbered[0][1][0] + fish_num_id = numbered[2][1][0] + + animals_abstract = self._get_abstract_num_for(doc, animals_num_id) + fish_abstract = self._get_abstract_num_for(doc, fish_num_id) + + assert animals_abstract is not None + assert fish_abstract is not None + + animals_abstract_id = int(animals_abstract.get(qn("w:abstractNumId"))) + fish_abstract_id = int(fish_abstract.get(qn("w:abstractNumId"))) + assert animals_abstract_id != fish_abstract_id, ( + "Each list should reference a different abstractNum to prevent Word from dropping numbers" + ) + + def test_all_items_at_ilvl_zero(self): + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + for text, (num_id, ilvl) in numbered: + assert ilvl == 0, f"'{text}' should be at ilvl 0, got {ilvl}" + + def test_level_zero_is_left_aligned(self): + """Level-0 paren lists with no leading spaces should be left-aligned (left=hanging, hanging=504).""" + from docx import Document + from docx.oxml.ns import qn + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + first_num_id = numbered[0][1][0] + abstract = self._get_abstract_num_for(doc, first_num_id) + assert abstract is not None + + lvl0 = None + for lvl in abstract.findall(qn("w:lvl")): + if int(lvl.get(qn("w:ilvl"))) == 0: + lvl0 = lvl + break + assert lvl0 is not None + + pPr = lvl0.find(qn("w:pPr")) + assert pPr is not None + ind = pPr.find(qn("w:ind")) + assert ind is not None + left = ind.get(qn("w:left")) + hanging = ind.get(qn("w:hanging")) + assert left == hanging, ( + f"Level 0 left indent should equal hanging (left-aligned), got left={left}, hanging={hanging}" + ) + + def test_all_use_decimal_parenthesized_format(self): + from docx import Document + from docx.oxml.ns import qn + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + + numbered = [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + for text, (num_id, ilvl) in numbered: + abstract = self._get_abstract_num_for(doc, num_id) + assert abstract is not None + for lvl in abstract.findall(qn("w:lvl")): + if int(lvl.get(qn("w:ilvl"))) == ilvl: + fmt = lvl.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == "decimal", f"'{text}' should use decimal, got {fmt}" + lvl_text = lvl.find(qn("w:lvlText")).get(qn("w:val")) + assert "(" in lvl_text, ( + f"'{text}' should have paren format, got '{lvl_text}'" + ) + + +class TestMixedNumberedListFormats: + """Verify that five different numbered-list styles all render correctly: + 1. standard ``1. 2. 3.`` decimal + 2. parenthesized decimal ``(1) (2) (3)`` + 3. parenthesized lower letter ``(a) (b) (c)`` + 4. parenthesized upper letter ``(A) (B) (C)`` + 5. parenthesized lower roman ``(i) (ii) (iii)`` + + Each list must: + - be a real numbered (not bullet) list in the OOXML + - use the correct numFmt + - use left-aligned justification + - display the correct item text ("one", "two", "three") + - have a consistent hanging indent so text is aligned across items + whose numbering labels differ in width (e.g. ``(i)`` vs ``(iii)``). + """ + + MARKDOWN = ( + "# numbers\n" + "1. one\n" + "2. two\n" + "3. three\n" + "# numbers in brackets\n" + "(1) one\n" + "(2) two\n" + "(3) three\n" + "# letters in brackets\n" + "(a) one\n" + "(b) two\n" + "(c) three\n" + "# capital letters in brackets\n" + "(A) one\n" + "(B) two\n" + "(C) three\n" + "# roman numerals\n" + "(i) one\n" + "(ii) two\n" + "(iii) three" + ) + + @staticmethod + def _get_numpr(paragraph): + from docx.oxml.ns import qn + + pPr = paragraph._p.find(qn("w:pPr")) + if pPr is None: + return None + numPr = pPr.find(qn("w:numPr")) + if numPr is None: + return None + numId_el = numPr.find(qn("w:numId")) + ilvl_el = numPr.find(qn("w:ilvl")) + if numId_el is None or ilvl_el is None: + return None + return int(numId_el.get(qn("w:val"))), int(ilvl_el.get(qn("w:val"))) + + @staticmethod + def _get_abstract_num_for(doc, num_id): + from docx.oxml.ns import qn + + numbering = doc.part.numbering_part._element + for num_el in numbering.findall(qn("w:num")): + if int(num_el.get(qn("w:numId"))) == num_id: + abstract_ref = num_el.find(qn("w:abstractNumId")) + abstract_id = int(abstract_ref.get(qn("w:val"))) + for an in numbering.findall(qn("w:abstractNum")): + if int(an.get(qn("w:abstractNumId"))) == abstract_id: + return an + return None + + @staticmethod + def _get_lvl(abstract, ilvl): + from docx.oxml.ns import qn + + for lvl in abstract.findall(qn("w:lvl")): + if int(lvl.get(qn("w:ilvl"))) == ilvl: + return lvl + return None + + def _build_doc(self): + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + return doc + + def _numbered_paragraphs(self, doc): + return [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + + # ---- 1. All 15 items are numbered paragraphs ---- + + def test_produces_fifteen_numbered_paragraphs(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + assert len(numbered) == 15, ( + f"Expected 15 numbered paragraphs (5 lists × 3 items), got {len(numbered)}: " + f"{[t for t, _ in numbered]}" + ) + + # ---- 2. Item text is correct ---- + + def test_item_text_is_correct(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + texts = [t for t, _ in numbered] + for i in range(5): + group = texts[i * 3 : i * 3 + 3] + assert group == ["one", "two", "three"], ( + f"List group {i} text should be ['one', 'two', 'three'], got {group}" + ) + + # ---- 3. Each list uses the correct numFmt ---- + + def test_standard_decimal_uses_decimal_format(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_id = numbered[0][1][0] + ilvl = numbered[0][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + fmt = lvl.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == "decimal", f"Standard numbered list should be decimal, got {fmt}" + + def test_paren_decimal_uses_decimal_format(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_id = numbered[3][1][0] + ilvl = numbered[3][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + fmt = lvl.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == "decimal", f"Paren decimal list should be decimal, got {fmt}" + + def test_paren_lower_letter_uses_lower_letter_format(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_id = numbered[6][1][0] + ilvl = numbered[6][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + fmt = lvl.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == "lowerLetter", ( + f"Lower letter list should be lowerLetter, got {fmt}" + ) + + def test_paren_upper_letter_uses_upper_letter_format(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_id = numbered[9][1][0] + ilvl = numbered[9][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + fmt = lvl.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == "upperLetter", ( + f"Upper letter list should be upperLetter, got {fmt}" + ) + + def test_paren_roman_uses_lower_roman_format(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_id = numbered[12][1][0] + ilvl = numbered[12][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + fmt = lvl.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == "lowerRoman", ( + f"Roman numeral list should be lowerRoman, got {fmt}" + ) + + # ---- 4. Parenthesized lvlText for bracket lists ---- + + def test_paren_lists_use_parenthesized_lvl_text(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + # Lists at indices 3, 6, 9, 12 are the paren lists + for start_idx in (3, 6, 9, 12): + num_id = numbered[start_idx][1][0] + ilvl = numbered[start_idx][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + lvl_text = lvl.find(qn("w:lvlText")).get(qn("w:val")) + assert "(" in lvl_text and ")" in lvl_text, ( + f"Item '{numbered[start_idx][0]}' (idx {start_idx}) should have " + f"parenthesized lvlText, got '{lvl_text}'" + ) + + def test_standard_decimal_uses_dot_lvl_text(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_id = numbered[0][1][0] + ilvl = numbered[0][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + lvl_text = lvl.find(qn("w:lvlText")).get(qn("w:val")) + assert "." in lvl_text, ( + f"Standard decimal should use dot format, got '{lvl_text}'" + ) + + # ---- 5. Left-aligned justification ---- + + def test_all_lists_are_left_aligned(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + checked = set() + for text, (num_id, ilvl) in numbered: + key = (num_id, ilvl) + if key in checked: + continue + checked.add(key) + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + jc = lvl.find(qn("w:lvlJc")) + assert jc is not None, f"'{text}' level should have lvlJc element" + assert jc.get(qn("w:val")) == "left", ( + f"'{text}' should be left-aligned, got '{jc.get(qn('w:val'))}'" + ) + + # ---- 6. Text alignment consistency (hanging indent) ---- + + def test_items_within_each_list_share_same_hanging_indent(self): + """All items in a single list must use the same hanging indent so that + the text column is aligned even when numbering labels vary in width + (e.g. ``(i)`` vs ``(iii)``).""" + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + + for start_idx in range(0, 15, 3): + group = numbered[start_idx : start_idx + 3] + # All items in a group share the same numId + ilvl, so they share + # the same abstractNum level definition → same indent. + num_id = group[0][1][0] + ilvl = group[0][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + pPr = lvl.find(qn("w:pPr")) + assert pPr is not None, f"Level {ilvl} should have pPr" + ind = pPr.find(qn("w:ind")) + assert ind is not None, f"Level {ilvl} should have indent" + hanging = ind.get(qn("w:hanging")) + left = ind.get(qn("w:left")) + assert hanging is not None, ( + f"Hanging indent should be set for list starting at idx {start_idx}" + ) + assert left is not None, ( + f"Left indent should be set for list starting at idx {start_idx}" + ) + + def test_all_lists_share_same_hanging_indent(self): + """All lists at the same indentation level must use the same left and + hanging indent so that item text aligns at the same column regardless + of whether the list uses ``1.``, ``(a)``, ``(A)``, or ``(iii)`` labels.""" + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + + indent_values = [] + for start_idx in range(0, 15, 3): + num_id = numbered[start_idx][1][0] + ilvl = numbered[start_idx][1][1] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + ind = lvl.find(qn("w:pPr")).find(qn("w:ind")) + hanging = ind.get(qn("w:hanging")) + left = ind.get(qn("w:left")) + indent_values.append((left, hanging)) + + first = indent_values[0] + for i, val in enumerate(indent_values): + assert val == first, ( + f"List group {i} indent {val} differs from group 0 indent {first}; " + f"all lists must share the same indent for consistent text alignment" + ) + + def test_all_levels_use_tab_suffix(self): + """Each numbering level must use ```` so Word + inserts a tab (not a space) after the label. This ensures text aligns + at the left-indent position regardless of label width.""" + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + + checked = set() + for text, (num_id, ilvl) in numbered: + key = (num_id, ilvl) + if key in checked: + continue + checked.add(key) + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + suff = lvl.find(qn("w:suff")) + assert suff is not None, f"'{text}' level should have a element" + assert suff.get(qn("w:val")) == "tab", ( + f"'{text}' suffix should be 'tab', got '{suff.get(qn('w:val'))}'" + ) + + +class TestOrderedListStartOverride: + """Verify that ordered lists respect the start number from the markdown. + + Markdown input: + # numbers + 1. one + 2. two + 3. three + # continuation of numbers + 4. four + 5. five + 6. six + + Expected: two separate lists, each with 3 items. + The first list numbers 1, 2, 3; the second list numbers 4, 5, 6. + """ + + MARKDOWN = ( + "# numbers\n" + "1. one\n" + "2. two\n" + "3. three\n" + "# continuation of numbers\n" + "4. four\n" + "5. five\n" + "6. six\n" + ) + + EXPECTED_ITEMS = [ + ("one", 1), + ("two", 2), + ("three", 3), + ("four", 4), + ("five", 5), + ("six", 6), + ] + + @staticmethod + def _get_numpr(paragraph): + from docx.oxml.ns import qn + + pPr = paragraph._p.find(qn("w:pPr")) + if pPr is None: + return None + numPr = pPr.find(qn("w:numPr")) + if numPr is None: + return None + numId_el = numPr.find(qn("w:numId")) + ilvl_el = numPr.find(qn("w:ilvl")) + if numId_el is None or ilvl_el is None: + return None + return int(numId_el.get(qn("w:val"))), int(ilvl_el.get(qn("w:val"))) + + @staticmethod + def _get_start_val(doc, num_id, ilvl): + """Return the effective start value for a numbering instance. + + Checks / first, then falls back to + the value in the abstract numbering level definition. + """ + from docx.oxml.ns import qn + + numbering = doc.part.numbering_part._element + for num_el in numbering.findall(qn("w:num")): + if int(num_el.get(qn("w:numId"))) != num_id: + continue + # Check for startOverride + for ovr in num_el.findall(qn("w:lvlOverride")): + if int(ovr.get(qn("w:ilvl"))) == ilvl: + start_ovr = ovr.find(qn("w:startOverride")) + if start_ovr is not None: + return int(start_ovr.get(qn("w:val"))) + # Fall back to abstract num + abs_ref = num_el.find(qn("w:abstractNumId")) + abs_id = int(abs_ref.get(qn("w:val"))) + for an in numbering.findall(qn("w:abstractNum")): + if int(an.get(qn("w:abstractNumId"))) == abs_id: + for lvl_el in an.findall(qn("w:lvl")): + if int(lvl_el.get(qn("w:ilvl"))) == ilvl: + start_el = lvl_el.find(qn("w:start")) + if start_el is not None: + return int(start_el.get(qn("w:val"))) + return None + + def _build_doc(self): + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + return doc + + def _numbered_paragraphs(self, doc): + return [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + + def test_produces_six_numbered_paragraphs(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + assert len(numbered) == 6, ( + f"Expected 6 numbered paragraphs, got {len(numbered)}: " + f"{[t for t, _ in numbered]}" + ) + + def test_two_distinct_lists(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_ids = [numpr[0] for _, numpr in numbered] + distinct = list(dict.fromkeys(num_ids)) + assert len(distinct) == 2, ( + f"Expected 2 distinct numIds (two lists), got {len(distinct)}: {distinct}" + ) + + def test_each_list_has_three_items(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_ids = [numpr[0] for _, numpr in numbered] + distinct = list(dict.fromkeys(num_ids)) + first_count = sum(1 for n in num_ids if n == distinct[0]) + second_count = sum(1 for n in num_ids if n == distinct[1]) + assert first_count == 3, f"First list should have 3 items, got {first_count}" + assert second_count == 3, f"Second list should have 3 items, got {second_count}" + + def test_item_text_matches(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + for idx, (text, _) in enumerate(numbered): + expected_text = self.EXPECTED_ITEMS[idx][0] + assert text == expected_text, ( + f"Item {idx}: expected text {expected_text!r}, got {text!r}" + ) + + def test_first_list_starts_at_one(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_id, ilvl = numbered[0][1] + start = self._get_start_val(doc, num_id, ilvl) + assert start == 1, f"First list should start at 1, got {start}" + + def test_second_list_starts_at_four(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + num_id, ilvl = numbered[3][1] + start = self._get_start_val(doc, num_id, ilvl) + assert start == 4, f"Second list should start at 4, got {start}" + + def test_effective_numbers_are_correct(self): + """Verify that the effective number for each item is correct by + checking the start value of its list and its position within the list.""" + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + + for idx, (text, (num_id, ilvl)) in enumerate(numbered): + list_start = self._get_start_val(doc, num_id, ilvl) + position_in_list = sum(1 for i in range(idx) if numbered[i][1][0] == num_id) + effective_number = list_start + position_in_list + expected_number = self.EXPECTED_ITEMS[idx][1] + assert effective_number == expected_number, ( + f"Item {idx} ({text!r}): expected number {expected_number}, " + f"got {effective_number} (list_start={list_start}, pos={position_in_list})" + ) + + def test_abstract_num_start_matches_override(self): + """The abstract numbering value must match the startOverride + so that renderers which ignore lvlOverride still produce correct + numbering.""" + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + numbering = doc.part.numbering_part._element + + # Second list (items 3-5) should have abstract start = 4 + num_id, ilvl = numbered[3][1] + for num_el in numbering.findall(qn("w:num")): + if int(num_el.get(qn("w:numId"))) != num_id: + continue + abs_ref = num_el.find(qn("w:abstractNumId")) + abs_id = int(abs_ref.get(qn("w:val"))) + for an in numbering.findall(qn("w:abstractNum")): + if int(an.get(qn("w:abstractNumId"))) == abs_id: + for lvl_el in an.findall(qn("w:lvl")): + if int(lvl_el.get(qn("w:ilvl"))) == ilvl: + start_el = lvl_el.find(qn("w:start")) + assert start_el is not None + assert int(start_el.get(qn("w:val"))) == 4, ( + f"Abstract numbering start should be 4, " + f"got {start_el.get(qn('w:val'))}" + ) + + +class TestNestedNumberedListIndentation: + """Verify deeply nested numbered lists with mixed parenthesized formats. + + Markdown input: + 1. one + (1) one + (a) one + (A) one + (i) one + (ii) two + (B) two + (i) one + (ii) two + (b) two + (A) one + (B) two + (2) two + (a) one + (b) two + 2. two + (1) one + (2) two + + Expected nesting levels (ilvl): + 0 → 1. / 2. + 1 → (1) / (2) + 2 → (a) / (b) + 3 → (A) / (B) + 4 → (i) / (ii) + + Each ilvl must have left indent = hanging * (ilvl + 1) where + hanging = 504 twips. + """ + + MARKDOWN = ( + "# nested numbered lists\n" + "1. one\n" + " (1) one\n" + "\t (a) one\n" + "\t\t (A) one\n" + "\t\t\t (i) one\n" + "\t\t\t\t(ii) two\n" + "\t\t\t(B) two\n" + "\t\t\t (i) one\n" + "\t\t\t\t(ii) two\n" + "\t\t(b) two\n" + "\t\t (A) one\n" + "\t\t\t(B) two\n" + "\t(2) two\n" + "\t (a) one\n" + "\t\t(b) two\n" + "2. two\n" + " (1) one\n" + "\t(2) two\n" + ) + + # (expected_text, expected_ilvl, expected_num_fmt) + EXPECTED_ITEMS = [ + ("one", 0, "decimal"), # 1. + ("one", 1, "decimal"), # (1) + ("one", 2, "lowerLetter"), # (a) + ("one", 3, "upperLetter"), # (A) + ("one", 4, "lowerRoman"), # (i) + ("two", 4, "lowerRoman"), # (ii) + ("two", 3, "upperLetter"), # (B) + ("one", 4, "lowerRoman"), # (i) + ("two", 4, "lowerRoman"), # (ii) + ("two", 2, "lowerLetter"), # (b) + ("one", 3, "upperLetter"), # (A) + ("two", 3, "upperLetter"), # (B) + ("two", 1, "decimal"), # (2) + ("one", 2, "lowerLetter"), # (a) + ("two", 2, "lowerLetter"), # (b) + ("two", 0, "decimal"), # 2. + ("one", 1, "decimal"), # (1) + ("two", 1, "decimal"), # (2) + ] + + HANGING = 504 # _LIST_HANGING_INDENT + + @staticmethod + def _get_numpr(paragraph): + from docx.oxml.ns import qn + + pPr = paragraph._p.find(qn("w:pPr")) + if pPr is None: + return None + numPr = pPr.find(qn("w:numPr")) + if numPr is None: + return None + numId_el = numPr.find(qn("w:numId")) + ilvl_el = numPr.find(qn("w:ilvl")) + if numId_el is None or ilvl_el is None: + return None + return int(numId_el.get(qn("w:val"))), int(ilvl_el.get(qn("w:val"))) + + @staticmethod + def _get_abstract_num_for(doc, num_id): + from docx.oxml.ns import qn + + numbering = doc.part.numbering_part._element + for num_el in numbering.findall(qn("w:num")): + if int(num_el.get(qn("w:numId"))) == num_id: + abstract_ref = num_el.find(qn("w:abstractNumId")) + abstract_id = int(abstract_ref.get(qn("w:val"))) + for an in numbering.findall(qn("w:abstractNum")): + if int(an.get(qn("w:abstractNumId"))) == abstract_id: + return an + return None + + @staticmethod + def _get_lvl(abstract, ilvl): + from docx.oxml.ns import qn + + for lvl in abstract.findall(qn("w:lvl")): + if int(lvl.get(qn("w:ilvl"))) == ilvl: + return lvl + return None + + def _build_doc(self): + from docx import Document + + doc = Document() + parse_markdown_to_docx(doc, self.MARKDOWN) + return doc + + def _numbered_paragraphs(self, doc): + return [ + (p.text.strip(), self._get_numpr(p)) + for p in doc.paragraphs + if self._get_numpr(p) + ] + + def test_produces_eighteen_numbered_paragraphs(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + assert len(numbered) == 18, ( + f"Expected 18 numbered paragraphs, got {len(numbered)}: " + f"{[t for t, _ in numbered]}" + ) + + def test_item_text_matches_expected(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + for idx, (text, _) in enumerate(numbered): + expected_text = self.EXPECTED_ITEMS[idx][0] + assert text == expected_text, ( + f"Item {idx}: expected text {expected_text!r}, got {text!r}" + ) + + def test_ilvl_matches_expected(self): + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + for idx, (text, (num_id, ilvl)) in enumerate(numbered): + expected_ilvl = self.EXPECTED_ITEMS[idx][1] + assert ilvl == expected_ilvl, ( + f"Item {idx} ({text!r}): expected ilvl={expected_ilvl}, got ilvl={ilvl}" + ) + + def test_num_fmt_matches_expected(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + for idx, (text, (num_id, ilvl)) in enumerate(numbered): + expected_fmt = self.EXPECTED_ITEMS[idx][2] + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + fmt = lvl.find(qn("w:numFmt")).get(qn("w:val")) + assert fmt == expected_fmt, ( + f"Item {idx} ({text!r}): expected numFmt={expected_fmt!r}, got {fmt!r}" + ) + + def test_left_indent_matches_ilvl(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + for idx, (text, (num_id, ilvl)) in enumerate(numbered): + expected_left = self.HANGING * (ilvl + 1) + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + pPr = lvl.find(qn("w:pPr")) + assert pPr is not None, f"Item {idx} ({text!r}): missing pPr" + ind = pPr.find(qn("w:ind")) + assert ind is not None, f"Item {idx} ({text!r}): missing ind" + left = int(ind.get(qn("w:left"))) + assert left == expected_left, ( + f"Item {idx} ({text!r}, ilvl={ilvl}): " + f"expected left indent={expected_left}, got {left}" + ) + + def test_hanging_indent_is_consistent(self): + from docx.oxml.ns import qn + + doc = self._build_doc() + numbered = self._numbered_paragraphs(doc) + for idx, (text, (num_id, ilvl)) in enumerate(numbered): + abstract = self._get_abstract_num_for(doc, num_id) + lvl = self._get_lvl(abstract, ilvl) + ind = lvl.find(qn("w:pPr")).find(qn("w:ind")) + hanging = int(ind.get(qn("w:hanging"))) + assert hanging == self.HANGING, ( + f"Item {idx} ({text!r}): expected hanging={self.HANGING}, got {hanging}" + )