2222 import docx
2323 from docx .document import Document as DocxDocument
2424 from docx .table import Table
25+ from docx .text .hyperlink import Hyperlink
2526 from docx .text .paragraph import Paragraph
27+ from docx .text .run import Run
2628 from lxml .etree import _Comment
2729
2830
@@ -89,6 +91,31 @@ def from_str(string: str) -> "DOCXTableFormat":
8991 return table_format
9092
9193
class DOCXLinkFormat(Enum):
    """
    Supported formats for storing DOCX link information in a Document.
    """

    MARKDOWN = "markdown"
    PLAIN = "plain"
    NONE = "none"

    def __str__(self) -> str:
        # str() yields the raw value ("markdown", "plain" or "none") instead of the default
        # "DOCXLinkFormat.X" representation.
        return self.value

    @staticmethod
    def from_str(string: str) -> "DOCXLinkFormat":
        """
        Convert a string to a DOCXLinkFormat enum.

        The lookup is case-insensitive: the input is lower-cased before it is matched against the enum values.

        :param string: The name of the link format: "markdown", "plain" or "none".
        :returns: The DOCXLinkFormat member whose value matches the string.
        :raises ValueError: If the string does not match any supported format.
        """
        enum_map = {e.value: e for e in DOCXLinkFormat}
        link_format = enum_map.get(string.lower())
        if link_format is None:
            # Keep the placeholders tight against the surrounding text so the message does not
            # contain stray spaces inside the quotes or at the end.
            msg = f"Unknown link format '{string}'. Supported formats are: {list(enum_map.keys())}"
            raise ValueError(msg)
        return link_format
117+
118+
92119@component
93120class DOCXToDocument :
94121 """
@@ -99,28 +126,38 @@ class DOCXToDocument:
99126
100127 Usage example:
101128 ```python
102- from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
129+ from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
103130
104- converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
131+ converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.MARKDOWN )
105132 results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
106133 documents = results["documents"]
107134 print(documents[0].content)
108135 # 'This is a text from the DOCX file.'
109136 ```
110137 """
111138
def __init__(
    self,
    table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV,
    link_format: Union[str, DOCXLinkFormat] = DOCXLinkFormat.NONE,
    store_full_path: bool = False,
):
    """
    Create a DOCXToDocument component.

    :param table_format: How tables are rendered. Accepts DOCXTableFormat.MARKDOWN, DOCXTableFormat.CSV,
        or the equivalent strings "markdown" and "csv".
    :param link_format: How hyperlinks are rendered. Accepts:
        DOCXLinkFormat.MARKDOWN or "markdown" to get [text](address),
        DOCXLinkFormat.PLAIN or "plain" to get text (address),
        DOCXLinkFormat.NONE or "none" to get text without links.
    :param store_full_path:
        If True, the full path of the file is stored in the metadata of the document.
        If False, only the file name is stored.
    """
    docx_import.check()

    # String arguments are converted to their enum members; enum members are stored as given.
    resolved_table_format = table_format
    if isinstance(resolved_table_format, str):
        resolved_table_format = DOCXTableFormat.from_str(resolved_table_format)
    self.table_format = resolved_table_format

    resolved_link_format = link_format
    if isinstance(resolved_link_format, str):
        resolved_link_format = DOCXLinkFormat.from_str(resolved_link_format)
    self.link_format = resolved_link_format

    self.store_full_path = store_full_path
125162
126163 def to_dict (self ) -> Dict [str , Any ]:
@@ -130,7 +167,12 @@ def to_dict(self) -> Dict[str, Any]:
130167 :returns:
131168 Dictionary with serialized data.
132169 """
133- return default_to_dict (self , table_format = str (self .table_format ), store_full_path = self .store_full_path )
170+ return default_to_dict (
171+ self ,
172+ table_format = str (self .table_format ),
173+ link_format = str (self .link_format ),
174+ store_full_path = self .store_full_path ,
175+ )
134176
135177 @classmethod
136178 def from_dict (cls , data : Dict [str , Any ]) -> "DOCXToDocument" :
@@ -144,6 +186,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
144186 """
145187 if "table_format" in data ["init_parameters" ]:
146188 data ["init_parameters" ]["table_format" ] = DOCXTableFormat .from_str (data ["init_parameters" ]["table_format" ])
189+ if "link_format" in data ["init_parameters" ]:
190+ data ["init_parameters" ]["link_format" ] = DOCXLinkFormat .from_str (data ["init_parameters" ]["link_format" ])
147191 return default_from_dict (cls , data )
148192
149193 @component .output_types (documents = List [Document ])
@@ -218,7 +262,7 @@ def _extract_elements(self, document: "DocxDocument") -> List[str]:
218262 if paragraph .contains_page_break :
219263 para_text = self ._process_paragraph_with_page_breaks (paragraph )
220264 else :
221- para_text = paragraph . text
265+ para_text = self . _process_links_in_paragraph ( paragraph )
222266 elements .append (para_text )
223267 elif element .tag .endswith ("tbl" ):
224268 table = docx .table .Table (element , document )
@@ -244,18 +288,42 @@ def _process_paragraph_with_page_breaks(self, paragraph: "Paragraph") -> str:
244288 # Can only extract text from first paragraph page break, unfortunately
245289 if pb_index == 0 :
246290 if page_break .preceding_paragraph_fragment :
247- para_text += page_break .preceding_paragraph_fragment . text
291+ para_text += self . _process_links_in_paragraph ( page_break .preceding_paragraph_fragment )
248292 para_text += "\f "
249293 if page_break .following_paragraph_fragment :
250294 # following_paragraph_fragment contains all text for remainder of paragraph.
251295 # However, if the remainder of the paragraph spans multiple page breaks, it won't include
252296 # those later page breaks so we have to add them at end of text in the `else` block below.
253297 # This is not ideal, but this case should be very rare and this is likely good enough.
254- para_text += page_break .following_paragraph_fragment . text
298+ para_text += self . _process_links_in_paragraph ( page_break .following_paragraph_fragment )
255299 else :
256300 para_text += "\f "
257301 return para_text
258302
def _process_links_in_paragraph(self, paragraph: "Paragraph") -> str:
    """
    Processes links in a paragraph and formats them according to the specified link format.

    :param paragraph: The DOCX paragraph to process.
    :returns: A string with links formatted according to the specified format:
        `paragraph.text` unchanged for DOCXLinkFormat.NONE, `[text](address)` per hyperlink for
        DOCXLinkFormat.MARKDOWN, and `text (address)` per hyperlink otherwise (PLAIN).
    """
    if self.link_format == DOCXLinkFormat.NONE:
        return paragraph.text

    parts = []
    # Iterate over all hyperlinks and other content in the paragraph
    # https://python-docx.readthedocs.io/en/latest/api/text.html#docx.text.paragraph.Paragraph.iter_inner_content
    for content in paragraph.iter_inner_content():
        if isinstance(content, Run):
            parts.append(content.text)
        elif isinstance(content, Hyperlink):
            # NOTE(review): python-docx documents `address` as "" for internal jump links, which
            # would render as an empty target here - confirm whether that needs special handling.
            if self.link_format == DOCXLinkFormat.MARKDOWN:
                parts.append(f"[{content.text}]({content.address})")
            else:  # PLAIN format
                parts.append(f"{content.text} ({content.address})")

    # Content that is neither a Run nor a Hyperlink contributes nothing to the output.
    return "".join(parts)
326+
259327 def _table_to_markdown (self , table : "Table" ) -> str :
260328 """
261329 Converts a DOCX table to a Markdown string.
0 commit comments