2727# Regex for field-code hyperlink instruction
2828_RE_FIELD_HYPERLINK = re .compile (r'HYPERLINK\s+"([^"]+)"' )
2929
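30+ # Regex for page-number-only footer/header text (e.g. "第 页", "共 页", "- 3 -", "3 of 10", "3/10"); text containing the word "Page" is NOT matched, since the class only admits the letters o/f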
31+ _RE_PAGE_ONLY = re .compile (r"^[\s第页共of\d\-/|·]*$" , re .IGNORECASE )
32+
3033# Word XML namespace
3134_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
3235_W = "{" + _W_NS + "}"
@@ -297,65 +300,73 @@ def _detect_heading_level(para, body_font_size: float) -> int:
297300 return 0
298301
299302
300- def _parse_field_hyperlinks ( para ) -> dict :
301- """Parse w:fldChar field-code hyperlinks in a paragraph.
303+ class _FieldState :
304+ """Mutable state for tracking w:fldChar field codes across paragraphs."""
302305
303- Returns {id(run_element): url} for runs that are display text of a HYPERLINK field.
304- Handles nested fields (e.g. PAGEREF inside a hyperlink result phase) via depth counter.
305- """
306- field_urls : dict [int , str ] = {}
307- phase = None # None | "instr" | "result"
308- field_url = None
309- nest_depth = 0 # depth of nested fields inside result phase
306+ __slots__ = ("active" , "phase" , "nest_depth" , "url" )
307+ active : bool
308+ phase : object # None | "instr" | "result"
309+ nest_depth : int
310+ url : object # None | str
310311
311- for child in para ._element :
312+ def __init__ (self ):
313+ self .active = False
314+ self .phase = None # None | "instr" | "result"
315+ self .nest_depth = 0
316+ self .url = None
317+
318+
319+ def _update_field_state_for_paragraph (para_element , field_state ):
320+ """Update field_state by scanning a paragraph element's runs.
321+
322+ Used for early-exit paragraphs (TOC / math / empty / code) to keep
323+ cross-paragraph field tracking accurate without building item lists.
324+ """
325+ for child in para_element :
312326 tag = child .tag .split ("}" )[- 1 ] if "}" in child .tag else child .tag
313327 if tag != "r" :
314328 continue
315- fld_char = child .find (f" { _W } fldChar" )
329+ fld_char = child .find (_W + " fldChar" )
316330 if fld_char is not None :
317- fld_type = fld_char .get (f" { _W } fldCharType" )
331+ fld_type = fld_char .get (_W + " fldCharType" )
318332 if fld_type == "begin" :
319- if phase == "result" :
320- # Nested field (e.g. PAGEREF), increase depth and skip
321- nest_depth += 1
333+ if field_state .phase == "result" :
334+ field_state .nest_depth += 1
322335 else :
323- phase = "instr"
324- field_url = None
336+ field_state .active = True
337+ field_state .phase = "instr"
338+ field_state .url = None
325339 elif fld_type == "separate" :
326- if nest_depth == 0 :
327- phase = "result"
340+ if field_state . nest_depth == 0 :
341+ field_state . phase = "result"
328342 elif fld_type == "end" :
329- if nest_depth > 0 :
330- nest_depth -= 1
343+ if field_state . nest_depth > 0 :
344+ field_state . nest_depth -= 1
331345 else :
332- phase = None
333- field_url = None
346+ field_state .active = False
347+ field_state .phase = None
348+ field_state .url = None
334349 continue
335-
336- instr_elem = child .find (f"{ _W } instrText" )
337- if instr_elem is not None and phase == "instr" and nest_depth == 0 :
350+ instr_elem = child .find (_W + "instrText" )
351+ if instr_elem is not None and field_state .phase == "instr" :
338352 if instr_elem .text :
339353 m = _RE_FIELD_HYPERLINK .search (instr_elem .text )
340354 if m :
341- field_url = m .group (1 )
342- continue
355+ field_state .url = m .group (1 )
343356
344- if phase == "result" and nest_depth == 0 and field_url :
345- t_elem = child .find (f"{ _W } t" )
346- if t_elem is not None and t_elem .text :
347- field_urls [id (child )] = field_url
348357
349- return field_urls
350-
351-
352- def _iter_paragraph_items (para ) -> list :
358+ def _iter_paragraph_items (para , field_state = None ) -> list :
353359 """Extract (bold, italic, underline, strikethrough, superscript, subscript, text, url) tuples from a paragraph in document order.
354360
355361 Handles python-docx Hyperlink objects and w:fldChar field-code hyperlinks.
356362 Silently degrades to plain text on error.
357- Note: underline is forced to False inside Hyperlink runs to avoid Word's default hyperlink underline style.
363+ Note: underline is forced to False inside Hyperlink/field-hyperlink runs to avoid Word's default hyperlink underline style.
364+
365+ field_state: optional _FieldState instance for cross-paragraph field tracking.
366+ If None, a fresh _FieldState is created (single-paragraph mode).
358367 """
368+ if field_state is None :
369+ field_state = _FieldState ()
359370
360371 def _split_breaks (items ):
361372 """Expand items containing \\ n (from <w:br/> soft line breaks) into per-line items with <br> separators."""
@@ -425,11 +436,6 @@ def _split_breaks(items):
425436 ]
426437 )
427438
428- try :
429- field_urls = _parse_field_hyperlinks (para )
430- except Exception :
431- field_urls = {}
432-
433439 items = []
434440 try :
435441 content_iter = para .iter_inner_content ()
@@ -480,22 +486,75 @@ def _split_breaks(items):
480486 (False , False , False , False , False , False , element .text , url )
481487 )
482488 else :
489+ # Plain Run — check for fldChar control elements first
490+ fld_char = element ._element .find (_W + "fldChar" )
491+ if fld_char is not None :
492+ fld_type = fld_char .get (_W + "fldCharType" )
493+ if fld_type == "begin" :
494+ if field_state .phase == "result" :
495+ field_state .nest_depth += 1
496+ else :
497+ field_state .active = True
498+ field_state .phase = "instr"
499+ field_state .url = None
500+ elif fld_type == "separate" :
501+ if field_state .nest_depth == 0 :
502+ field_state .phase = "result"
503+ elif fld_type == "end" :
504+ if field_state .nest_depth > 0 :
505+ field_state .nest_depth -= 1
506+ else :
507+ field_state .active = False
508+ field_state .phase = None
509+ field_state .url = None
510+ continue
511+
512+ instr_elem = element ._element .find (_W + "instrText" )
513+ if instr_elem is not None :
514+ if field_state .active and field_state .phase == "instr" :
515+ # Extract HYPERLINK url from instrText
516+ if instr_elem .text :
517+ m = _RE_FIELD_HYPERLINK .search (instr_elem .text )
518+ if m :
519+ field_state .url = m .group (1 )
520+ continue # Never emit instrText run as content
521+
483522 # Plain Run
484523 if not element .text :
485524 continue
486- url = field_urls .get (id (element ._element ), "" )
487- items .append (
488- (
489- _effective_bold (element , para ),
490- _effective_italic (element , para ),
491- _effective_underline (element , para ),
492- bool (element .font .strike ),
493- _effective_superscript (element ),
494- _effective_subscript (element ),
495- element .text ,
496- url ,
525+
526+ # If we are in the result phase of a field-code hyperlink, apply URL
527+ # and suppress underline (same as w:hyperlink element handling above).
528+ if (
529+ field_state .phase == "result"
530+ and field_state .nest_depth == 0
531+ and field_state .url
532+ ):
533+ items .append (
534+ (
535+ _effective_bold (element , para ),
536+ _effective_italic (element , para ),
537+ False , # suppress underline for field hyperlinks
538+ bool (element .font .strike ),
539+ _effective_superscript (element ),
540+ _effective_subscript (element ),
541+ element .text ,
542+ field_state .url ,
543+ )
544+ )
545+ else :
546+ items .append (
547+ (
548+ _effective_bold (element , para ),
549+ _effective_italic (element , para ),
550+ _effective_underline (element , para ),
551+ bool (element .font .strike ),
552+ _effective_superscript (element ),
553+ _effective_subscript (element ),
554+ element .text ,
555+ "" ,
556+ )
497557 )
498- )
499558 except Exception :
500559 continue
501560
@@ -1265,6 +1324,7 @@ def flush_toc_buf():
12651324 lines .append ("" )
12661325 toc_buf .clear ()
12671326
1327+ field_state = _FieldState ()
12681328 for child in _flatten_body (doc .element .body ):
12691329 tag = child .tag .split ("}" )[- 1 ]
12701330
@@ -1278,6 +1338,7 @@ def flush_toc_buf():
12781338 if toc_text :
12791339 toc_anchor = _extract_toc_anchor (child )
12801340 toc_buf .append ((toc_text , toc_anchor , toc_level ))
1341+ _update_field_state_for_paragraph (child , field_state )
12811342 continue
12821343
12831344 # Non-TOC paragraph: flush any buffered TOC entries
@@ -1338,6 +1399,7 @@ def _flush_textbox():
13381399 lines .append (math_md )
13391400 lines .append ("" )
13401401 _flush_textbox ()
1402+ _update_field_state_for_paragraph (child , field_state )
13411403 continue
13421404
13431405 if not text :
@@ -1353,19 +1415,21 @@ def _flush_textbox():
13531415 code_buf .append ("" ) # preserve blank lines inside code blocks
13541416 elif lines and lines [- 1 ] != "" :
13551417 lines .append ("" )
1418+ _update_field_state_for_paragraph (child , field_state )
13561419 continue
13571420
13581421 # Code paragraph: buffer it without heading/inline formatting
13591422 if _is_code_paragraph (para ):
13601423 code_buf .append (para .text )
13611424 _flush_textbox ()
1425+ _update_field_state_for_paragraph (child , field_state )
13621426 continue
13631427
13641428 # Non-code paragraph: flush any buffered code first
13651429 flush_code_buf ()
13661430
13671431 level = _detect_heading_level (para , body_font_size )
1368- inline = _runs_to_markdown (_iter_paragraph_items (para )) or text
1432+ inline = _runs_to_markdown (_iter_paragraph_items (para , field_state )) or text
13691433
13701434 if level > 0 :
13711435 # Strip outer **...** wrapping that headings may have inherited
@@ -1462,8 +1526,14 @@ def _collect_from_parts(parts, seen: set) -> list:
14621526 except Exception :
14631527 pass
14641528 text = " " .join (texts )
1465- # Filter: skip empty text, pure digits (page numbers), and duplicates
1466- if text and not text .strip ().isdigit () and text not in seen :
1529+ # Filter: skip empty text, pure digits (page numbers),
1530+ # page-number patterns (e.g. "第 页", "第6页", "共 页"), and duplicates
1531+ if (
1532+ text
1533+ and not text .strip ().isdigit ()
1534+ and not _RE_PAGE_ONLY .match (text .strip ())
1535+ and text not in seen
1536+ ):
14671537 seen .add (text )
14681538 results .append (text )
14691539 except Exception :
0 commit comments