From b39c3fb21026ef5de1195939bb1f817bc1a2972e Mon Sep 17 00:00:00 2001 From: Bruce-anle <840596168@qq.com> Date: Sat, 9 May 2026 00:52:11 +0800 Subject: [PATCH 1/2] fix: convert markdown headers and footers Background: convert_to_markdown reads word/header*.xml and word/footer*.xml, but passed w:hdr/w:ftr roots to parse_body_to_markdown. That parser only looked for w:body, so header/footer content was skipped.

Changes: allow parse_body_to_markdown to traverse w:hdr and w:ftr roots directly while preserving normal w:document/w:body behavior.

Verification: /home/brucean/doc4agent/.venv/bin/python -m pytest tests -q -p no:cacheprovider passed. --- .../converters/markdown_converter.py | 3 ++ tests/test_markdown_header_footer.py | 48 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tests/test_markdown_header_footer.py diff --git a/docx2everything/converters/markdown_converter.py b/docx2everything/converters/markdown_converter.py index 0ff408f..e7c0842 100644 --- a/docx2everything/converters/markdown_converter.py +++ b/docx2everything/converters/markdown_converter.py @@ -511,6 +511,9 @@ def parse_body_to_markdown(root, numbering_info=None, hyperlinks=None, images=No markdown_parts = [] body = root.find(qn('w:body')) + if body is None and root.tag in (qn('w:hdr'), qn('w:ftr')): + body = root + if body is None: return '' diff --git a/tests/test_markdown_header_footer.py b/tests/test_markdown_header_footer.py new file mode 100644 index 0000000..43d016d --- /dev/null +++ b/tests/test_markdown_header_footer.py @@ -0,0 +1,48 @@ +import xml.etree.ElementTree as ET + +from docx2everything.converters.markdown_converter import parse_body_to_markdown + + +W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + +def xml_root(xml): + return ET.fromstring(xml) + + +def test_header_root_is_converted_to_markdown(): + root = xml_root(f""" + + Header text + + """) + + markdown = parse_body_to_markdown(root) + + assert markdown == "Header 
text" + + +def test_footer_root_is_converted_to_markdown(): + root = xml_root(f""" + + Footer text + + """) + + markdown = parse_body_to_markdown(root) + + assert markdown == "Footer text" + + +def test_document_body_root_still_converts_to_markdown(): + root = xml_root(f""" + + + Body text + + + """) + + markdown = parse_body_to_markdown(root) + + assert markdown == "Body text" From f22671e447b08baadfc9b384ec0e29b49d761356 Mon Sep 17 00:00:00 2001 From: Bruce-anle <840596168@qq.com> Date: Sat, 9 May 2026 01:10:55 +0800 Subject: [PATCH 2/2] fix: use header and footer relationships Background: header and footer parts have their own relationship files. The markdown converter parsed header/footer XML but still used document.xml relationships, so header/footer hyperlinks and images could not resolve.

Changes: load word/_rels/header*.xml.rels and word/_rels/footer*.xml.rels while parsing each header/footer part.

Verification: /home/brucean/doc4agent/.venv/bin/python -m pytest tests -q -p no:cacheprovider passed.

Note: this branch is stacked on the header/footer markdown parsing fix. 
--- .../converters/markdown_converter.py | 8 +- ...st_markdown_header_footer_relationships.py | 80 +++++++++++++++++++ 2 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 tests/test_markdown_header_footer_relationships.py diff --git a/docx2everything/converters/markdown_converter.py b/docx2everything/converters/markdown_converter.py index e7c0842..559d52f 100644 --- a/docx2everything/converters/markdown_converter.py +++ b/docx2everything/converters/markdown_converter.py @@ -619,7 +619,9 @@ def convert_to_markdown(zipf, filelist, img_dir=None): try: header_xml = zipf.read(fname) header_root = ET.fromstring(header_xml) - header_md = parse_body_to_markdown(header_root, numbering_info, hyperlinks, images, img_dir, zipf, footnotes=footnotes, endnotes=endnotes, comments=comments, styles_info=styles_info, charts=charts) + header_rels = 'word/_rels/' + os.path.basename(fname) + '.rels' + header_hyperlinks, header_images = parse_relationships(zipf, header_rels) + header_md = parse_body_to_markdown(header_root, numbering_info, header_hyperlinks, header_images, img_dir, zipf, footnotes=footnotes, endnotes=endnotes, comments=comments, styles_info=styles_info, charts=charts) if header_md: markdown_parts.append(header_md) except Exception: @@ -644,7 +646,9 @@ def convert_to_markdown(zipf, filelist, img_dir=None): try: footer_xml = zipf.read(fname) footer_root = ET.fromstring(footer_xml) - footer_md = parse_body_to_markdown(footer_root, numbering_info, hyperlinks, images, img_dir, zipf, footnotes=footnotes, endnotes=endnotes, comments=comments, styles_info=styles_info, charts=charts) + footer_rels = 'word/_rels/' + os.path.basename(fname) + '.rels' + footer_hyperlinks, footer_images = parse_relationships(zipf, footer_rels) + footer_md = parse_body_to_markdown(footer_root, numbering_info, footer_hyperlinks, footer_images, img_dir, zipf, footnotes=footnotes, endnotes=endnotes, comments=comments, styles_info=styles_info, charts=charts) if footer_md: 
markdown_parts.append(footer_md) except Exception: diff --git a/tests/test_markdown_header_footer_relationships.py b/tests/test_markdown_header_footer_relationships.py new file mode 100644 index 0000000..08a9caa --- /dev/null +++ b/tests/test_markdown_header_footer_relationships.py @@ -0,0 +1,80 @@ +import io +import zipfile + +from docx2everything.converters.markdown_converter import convert_to_markdown + + +def make_docx(parts): + buffer = io.BytesIO() + with zipfile.ZipFile(buffer, "w") as zipf: + for name, content in parts.items(): + zipf.writestr(name, content) + buffer.seek(0) + return zipfile.ZipFile(buffer) + + +def test_header_hyperlink_uses_header_relationships(): + docx = make_docx({ + "word/document.xml": """ + + + + """, + "word/header1.xml": """ + + + + Header link + + + + """, + "word/_rels/header1.xml.rels": """ + + + + """, + }) + + markdown = convert_to_markdown(docx, docx.namelist()) + + assert markdown == "[Header link](https://example.com/header)" + + +def test_footer_image_uses_footer_relationships(): + docx = make_docx({ + "word/document.xml": """ + + + + """, + "word/footer1.xml": """ + + + + + + + + + + """, + "word/_rels/footer1.xml.rels": """ + + + + """, + }) + + markdown = convert_to_markdown(docx, docx.namelist()) + + assert markdown == "![footer-image.png](media/footer-image.png)"