Skip to content

Commit 26d00a7

Browse files
TingquanGao and claude authored
bugfix: doc2md (#17943)
* fix(doc2md): filter PAGE field placeholders and empty dynamic fields in headers/footers

  - Add _RE_PAGE_ONLY regex to filter page-number-only text patterns such as "第 页", "共 页", "- 3 -" that result from unresolved dynamic fields
  - Add field state machine in _iter_paragraph_items to properly track fldChar begin/separate/end boundaries for non-hyperlink fields, ensuring fldChar control runs and instrText runs are skipped while cached result-phase text (if present) is preserved
  - Fixes word5.docx footer outputting "第 页" instead of being filtered

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(doc2md): fix cross-paragraph field-code hyperlink tracking in docx

  Three bugs fixed:

  1. Replace id()-based run lookup with _FieldState object to avoid lxml proxy GC reuse causing URL-to-run mismatches (Bug 1).
  2. Pass _FieldState across paragraphs in _convert_body so HYPERLINK fields spanning paragraph boundaries (fldChar begin in para 1, end in para 2) correctly associate URLs with display text (Bug 2).
  3. Suppress underline for field-code hyperlink result-phase runs, matching existing w:hyperlink element behaviour (Bug 3).

  Removes _parse_field_hyperlinks() and inlines the state machine into _iter_paragraph_items(), adding _FieldState and _update_field_state_for_paragraph() helpers.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix paddle version for py38 CI

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 158416e commit 26d00a7

2 files changed

Lines changed: 128 additions & 58 deletions

File tree

.github/workflows/tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ jobs:
6060
pip install pytest
6161
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
6262
if [[ "${{ matrix.python-version }}" == "3.8" ]]; then
63-
python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
63+
python -m pip install paddlepaddle==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
6464
else
6565
python -m pip install paddlepaddle==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
6666
fi

paddleocr/_doc2md/converters/docx.py

Lines changed: 127 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,9 @@
2727
# Regex for field-code hyperlink instruction
2828
_RE_FIELD_HYPERLINK = re.compile(r'HYPERLINK\s+"([^"]+)"')
2929

30+
# Regex for page-number-only footer/header text (e.g. "第 页", "共 页", "- 3 -", "3 of 10")
31+
_RE_PAGE_ONLY = re.compile(r"^[\s第页共of\d\-/|·]*$", re.IGNORECASE)
32+
3033
# Word XML namespace
3134
_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
3235
_W = "{" + _W_NS + "}"
@@ -297,65 +300,73 @@ def _detect_heading_level(para, body_font_size: float) -> int:
297300
return 0
298301

299302

300-
def _parse_field_hyperlinks(para) -> dict:
301-
"""Parse w:fldChar field-code hyperlinks in a paragraph.
303+
class _FieldState:
304+
"""Mutable state for tracking w:fldChar field codes across paragraphs."""
302305

303-
Returns {id(run_element): url} for runs that are display text of a HYPERLINK field.
304-
Handles nested fields (e.g. PAGEREF inside a hyperlink result phase) via depth counter.
305-
"""
306-
field_urls: dict[int, str] = {}
307-
phase = None # None | "instr" | "result"
308-
field_url = None
309-
nest_depth = 0 # depth of nested fields inside result phase
306+
__slots__ = ("active", "phase", "nest_depth", "url")
307+
active: bool
308+
phase: object # None | "instr" | "result"
309+
nest_depth: int
310+
url: object # None | str
310311

311-
for child in para._element:
312+
def __init__(self):
313+
self.active = False
314+
self.phase = None # None | "instr" | "result"
315+
self.nest_depth = 0
316+
self.url = None
317+
318+
319+
def _update_field_state_for_paragraph(para_element, field_state):
320+
"""Update field_state by scanning a paragraph element's runs.
321+
322+
Used for early-exit paragraphs (TOC / math / empty / code) to keep
323+
cross-paragraph field tracking accurate without building item lists.
324+
"""
325+
for child in para_element:
312326
tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
313327
if tag != "r":
314328
continue
315-
fld_char = child.find(f"{_W}fldChar")
329+
fld_char = child.find(_W + "fldChar")
316330
if fld_char is not None:
317-
fld_type = fld_char.get(f"{_W}fldCharType")
331+
fld_type = fld_char.get(_W + "fldCharType")
318332
if fld_type == "begin":
319-
if phase == "result":
320-
# Nested field (e.g. PAGEREF), increase depth and skip
321-
nest_depth += 1
333+
if field_state.phase == "result":
334+
field_state.nest_depth += 1
322335
else:
323-
phase = "instr"
324-
field_url = None
336+
field_state.active = True
337+
field_state.phase = "instr"
338+
field_state.url = None
325339
elif fld_type == "separate":
326-
if nest_depth == 0:
327-
phase = "result"
340+
if field_state.nest_depth == 0:
341+
field_state.phase = "result"
328342
elif fld_type == "end":
329-
if nest_depth > 0:
330-
nest_depth -= 1
343+
if field_state.nest_depth > 0:
344+
field_state.nest_depth -= 1
331345
else:
332-
phase = None
333-
field_url = None
346+
field_state.active = False
347+
field_state.phase = None
348+
field_state.url = None
334349
continue
335-
336-
instr_elem = child.find(f"{_W}instrText")
337-
if instr_elem is not None and phase == "instr" and nest_depth == 0:
350+
instr_elem = child.find(_W + "instrText")
351+
if instr_elem is not None and field_state.phase == "instr":
338352
if instr_elem.text:
339353
m = _RE_FIELD_HYPERLINK.search(instr_elem.text)
340354
if m:
341-
field_url = m.group(1)
342-
continue
355+
field_state.url = m.group(1)
343356

344-
if phase == "result" and nest_depth == 0 and field_url:
345-
t_elem = child.find(f"{_W}t")
346-
if t_elem is not None and t_elem.text:
347-
field_urls[id(child)] = field_url
348357

349-
return field_urls
350-
351-
352-
def _iter_paragraph_items(para) -> list:
358+
def _iter_paragraph_items(para, field_state=None) -> list:
353359
"""Extract (bold, italic, underline, strikethrough, superscript, subscript, text, url) tuples from a paragraph in document order.
354360
355361
Handles python-docx Hyperlink objects and w:fldChar field-code hyperlinks.
356362
Silently degrades to plain text on error.
357-
Note: underline is forced to False inside Hyperlink runs to avoid Word's default hyperlink underline style.
363+
Note: underline is forced to False inside Hyperlink/field-hyperlink runs to avoid Word's default hyperlink underline style.
364+
365+
field_state: optional _FieldState instance for cross-paragraph field tracking.
366+
If None, a fresh _FieldState is created (single-paragraph mode).
358367
"""
368+
if field_state is None:
369+
field_state = _FieldState()
359370

360371
def _split_breaks(items):
361372
"""Expand items containing \\n (from <w:br/> soft line breaks) into per-line items with <br> separators."""
@@ -425,11 +436,6 @@ def _split_breaks(items):
425436
]
426437
)
427438

428-
try:
429-
field_urls = _parse_field_hyperlinks(para)
430-
except Exception:
431-
field_urls = {}
432-
433439
items = []
434440
try:
435441
content_iter = para.iter_inner_content()
@@ -480,22 +486,75 @@ def _split_breaks(items):
480486
(False, False, False, False, False, False, element.text, url)
481487
)
482488
else:
489+
# Plain Run — check for fldChar control elements first
490+
fld_char = element._element.find(_W + "fldChar")
491+
if fld_char is not None:
492+
fld_type = fld_char.get(_W + "fldCharType")
493+
if fld_type == "begin":
494+
if field_state.phase == "result":
495+
field_state.nest_depth += 1
496+
else:
497+
field_state.active = True
498+
field_state.phase = "instr"
499+
field_state.url = None
500+
elif fld_type == "separate":
501+
if field_state.nest_depth == 0:
502+
field_state.phase = "result"
503+
elif fld_type == "end":
504+
if field_state.nest_depth > 0:
505+
field_state.nest_depth -= 1
506+
else:
507+
field_state.active = False
508+
field_state.phase = None
509+
field_state.url = None
510+
continue
511+
512+
instr_elem = element._element.find(_W + "instrText")
513+
if instr_elem is not None:
514+
if field_state.active and field_state.phase == "instr":
515+
# Extract HYPERLINK url from instrText
516+
if instr_elem.text:
517+
m = _RE_FIELD_HYPERLINK.search(instr_elem.text)
518+
if m:
519+
field_state.url = m.group(1)
520+
continue # Never emit instrText run as content
521+
483522
# Plain Run
484523
if not element.text:
485524
continue
486-
url = field_urls.get(id(element._element), "")
487-
items.append(
488-
(
489-
_effective_bold(element, para),
490-
_effective_italic(element, para),
491-
_effective_underline(element, para),
492-
bool(element.font.strike),
493-
_effective_superscript(element),
494-
_effective_subscript(element),
495-
element.text,
496-
url,
525+
526+
# If we are in the result phase of a field-code hyperlink, apply URL
527+
# and suppress underline (same as w:hyperlink element handling above).
528+
if (
529+
field_state.phase == "result"
530+
and field_state.nest_depth == 0
531+
and field_state.url
532+
):
533+
items.append(
534+
(
535+
_effective_bold(element, para),
536+
_effective_italic(element, para),
537+
False, # suppress underline for field hyperlinks
538+
bool(element.font.strike),
539+
_effective_superscript(element),
540+
_effective_subscript(element),
541+
element.text,
542+
field_state.url,
543+
)
544+
)
545+
else:
546+
items.append(
547+
(
548+
_effective_bold(element, para),
549+
_effective_italic(element, para),
550+
_effective_underline(element, para),
551+
bool(element.font.strike),
552+
_effective_superscript(element),
553+
_effective_subscript(element),
554+
element.text,
555+
"",
556+
)
497557
)
498-
)
499558
except Exception:
500559
continue
501560

@@ -1265,6 +1324,7 @@ def flush_toc_buf():
12651324
lines.append("")
12661325
toc_buf.clear()
12671326

1327+
field_state = _FieldState()
12681328
for child in _flatten_body(doc.element.body):
12691329
tag = child.tag.split("}")[-1]
12701330

@@ -1278,6 +1338,7 @@ def flush_toc_buf():
12781338
if toc_text:
12791339
toc_anchor = _extract_toc_anchor(child)
12801340
toc_buf.append((toc_text, toc_anchor, toc_level))
1341+
_update_field_state_for_paragraph(child, field_state)
12811342
continue
12821343

12831344
# Non-TOC paragraph: flush any buffered TOC entries
@@ -1338,6 +1399,7 @@ def _flush_textbox():
13381399
lines.append(math_md)
13391400
lines.append("")
13401401
_flush_textbox()
1402+
_update_field_state_for_paragraph(child, field_state)
13411403
continue
13421404

13431405
if not text:
@@ -1353,19 +1415,21 @@ def _flush_textbox():
13531415
code_buf.append("") # preserve blank lines inside code blocks
13541416
elif lines and lines[-1] != "":
13551417
lines.append("")
1418+
_update_field_state_for_paragraph(child, field_state)
13561419
continue
13571420

13581421
# Code paragraph: buffer it without heading/inline formatting
13591422
if _is_code_paragraph(para):
13601423
code_buf.append(para.text)
13611424
_flush_textbox()
1425+
_update_field_state_for_paragraph(child, field_state)
13621426
continue
13631427

13641428
# Non-code paragraph: flush any buffered code first
13651429
flush_code_buf()
13661430

13671431
level = _detect_heading_level(para, body_font_size)
1368-
inline = _runs_to_markdown(_iter_paragraph_items(para)) or text
1432+
inline = _runs_to_markdown(_iter_paragraph_items(para, field_state)) or text
13691433

13701434
if level > 0:
13711435
# Strip outer **...** wrapping that headings may have inherited
@@ -1462,8 +1526,14 @@ def _collect_from_parts(parts, seen: set) -> list:
14621526
except Exception:
14631527
pass
14641528
text = " ".join(texts)
1465-
# Filter: skip empty text, pure digits (page numbers), and duplicates
1466-
if text and not text.strip().isdigit() and text not in seen:
1529+
# Filter: skip empty text, pure digits (page numbers),
1530+
# page-number patterns (e.g. "第 页", "第6页", "共 页"), and duplicates
1531+
if (
1532+
text
1533+
and not text.strip().isdigit()
1534+
and not _RE_PAGE_ONLY.match(text.strip())
1535+
and text not in seen
1536+
):
14671537
seen.add(text)
14681538
results.append(text)
14691539
except Exception:

0 commit comments

Comments
 (0)