deepset-ai
diff --git a/‎haystack/components/converters/docx.py‎
Lines changed: 76 additions & 8 deletions b/‎haystack/components/converters/docx.py‎
Lines changed: 76 additions & 8 deletions
diff --git a/‎releasenotes/notes/docx-links-e389808e1ec52d57.yaml‎
Lines changed: 5 additions & 0 deletions b/‎releasenotes/notes/docx-links-e389808e1ec52d57.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎test/components/converters/test_docx_file_to_document.py‎
Lines changed: 62 additions & 11 deletions b/‎test/components/converters/test_docx_file_to_document.py‎
Lines changed: 62 additions & 11 deletions
diff --git a/‎test/test_files/docx/sample_docx_with_links.docx‎
15.3 KB b/‎test/test_files/docx/sample_docx_with_links.docx‎
15.3 KB
diff --git a/‎test/test_files/docx/sample_docx_with_single_link.docx‎
14.4 KB b/‎test/test_files/docx/sample_docx_with_single_link.docx‎
14.4 KB
@@ -22,7 +22,9 @@
     import docx
     from docx.document import Document as DocxDocument
     from docx.table import Table
+    from docx.text.hyperlink import Hyperlink
     from docx.text.paragraph import Paragraph
+    from docx.text.run import Run
     from lxml.etree import _Comment
 
 
@@ -89,6 +91,31 @@ def from_str(string: str) -> "DOCXTableFormat":
         return table_format
 
 
+class DOCXLinkFormat(Enum):
+    """
+    Output styles available for hyperlinks found in a DOCX file.
+
+    The members MARKDOWN, PLAIN and NONE carry the string values "markdown", "plain" and "none".
+    """
+
+    MARKDOWN = "markdown"
+    PLAIN = "plain"
+    NONE = "none"
+
+    def __str__(self):
+        # Render a member as its raw string value.
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "DOCXLinkFormat":
+        """
+        Look up the DOCXLinkFormat member matching a string, ignoring case.
+
+        :param string: Name of the link format, for example "markdown".
+        :returns: The matching DOCXLinkFormat member.
+        :raises ValueError: If the string does not match any supported format.
+        """
+        members_by_value = {member.value: member for member in DOCXLinkFormat}
+        key = string.lower()
+        if key in members_by_value:
+            return members_by_value[key]
+        msg = f"Unknown link format '{string}'. Supported formats are: {list(members_by_value)}"
+        raise ValueError(msg)
+
+
 @component
 class DOCXToDocument:
     """
@@ -99,28 +126,38 @@ class DOCXToDocument:
 
     Usage example:
     ```python
-    from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
+    from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
 
-    converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
+    converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.MARKDOWN)
     results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the DOCX file.'
     ```
     """
 
-    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False):
+    def __init__(
+        self,
+        table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV,
+        link_format: Union[str, DOCXLinkFormat] = DOCXLinkFormat.NONE,
+        store_full_path: bool = False,
+    ):
         """
         Create a DOCXToDocument component.
 
         :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
-            DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
+            DOCXTableFormat.CSV, "markdown", or "csv".
+        :param link_format: The format for link output. Can be one of:
+            DOCXLinkFormat.MARKDOWN or "markdown" to get [text](address),
+            DOCXLinkFormat.PLAIN or "plain" to get text (address),
+            DOCXLinkFormat.NONE or "none" to get text without links (the default).
         :param store_full_path:
             If True, the full path of the file is stored in the metadata of the document.
             If False, only the file name is stored.
         """
         docx_import.check()
         self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
+        self.link_format = DOCXLinkFormat.from_str(link_format) if isinstance(link_format, str) else link_format
         self.store_full_path = store_full_path
 
     def to_dict(self) -> Dict[str, Any]:
@@ -130,7 +167,12 @@ def to_dict(self) -> Dict[str, Any]:
         :returns:
             Dictionary with serialized data.
         """
-        return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)
+        return default_to_dict(
+            self,
+            table_format=str(self.table_format),
+            link_format=str(self.link_format),
+            store_full_path=self.store_full_path,
+        )
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
@@ -144,6 +186,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
         """
         if "table_format" in data["init_parameters"]:
             data["init_parameters"]["table_format"] = DOCXTableFormat.from_str(data["init_parameters"]["table_format"])
+        if "link_format" in data["init_parameters"]:
+            data["init_parameters"]["link_format"] = DOCXLinkFormat.from_str(data["init_parameters"]["link_format"])
         return default_from_dict(cls, data)
 
     @component.output_types(documents=List[Document])
@@ -218,7 +262,7 @@ def _extract_elements(self, document: "DocxDocument") -> List[str]:
                 if paragraph.contains_page_break:
                     para_text = self._process_paragraph_with_page_breaks(paragraph)
                 else:
-                    para_text = paragraph.text
+                    para_text = self._process_links_in_paragraph(paragraph)
                 elements.append(para_text)
             elif element.tag.endswith("tbl"):
                 table = docx.table.Table(element, document)
@@ -244,18 +288,42 @@ def _process_paragraph_with_page_breaks(self, paragraph: "Paragraph") -> str:
             # Can only extract text from first paragraph page break, unfortunately
             if pb_index == 0:
                 if page_break.preceding_paragraph_fragment:
-                    para_text += page_break.preceding_paragraph_fragment.text
+                    para_text += self._process_links_in_paragraph(page_break.preceding_paragraph_fragment)
                 para_text += "\f"
                 if page_break.following_paragraph_fragment:
                     # following_paragraph_fragment contains all text for remainder of paragraph.
                     # However, if the remainder of the paragraph spans multiple page breaks, it won't include
                     # those later page breaks so we have to add them at end of text in the `else` block below.
                     # This is not ideal, but this case should be very rare and this is likely good enough.
-                    para_text += page_break.following_paragraph_fragment.text
+                    para_text += self._process_links_in_paragraph(page_break.following_paragraph_fragment)
             else:
                 para_text += "\f"
         return para_text
 
+    def _process_links_in_paragraph(self, paragraph: "Paragraph") -> str:
+        """
+        Processes links in a paragraph and formats them according to the specified link format.
+
+        Hyperlinks are rendered as `[text](url)` for DOCXLinkFormat.MARKDOWN and as `text (url)` for
+        DOCXLinkFormat.PLAIN. The URL includes the fragment (the part after `#`) when the hyperlink has one.
+        Hyperlinks without an external target (for example internal jumps to a bookmark) contribute only
+        their text, so no empty link target is emitted.
+
+        :param paragraph: The DOCX paragraph to process.
+        :returns: A string with links formatted according to the specified format.
+        """
+        if self.link_format == DOCXLinkFormat.NONE:
+            return paragraph.text
+
+        parts: List[str] = []
+        # Iterate over all hyperlinks and other content in the paragraph
+        # https://python-docx.readthedocs.io/en/latest/api/text.html#docx.text.paragraph.Paragraph.iter_inner_content
+        for content in paragraph.iter_inner_content():
+            if isinstance(content, Run):
+                parts.append(content.text)
+            elif isinstance(content, Hyperlink):
+                # `url` is the address plus "#fragment" when a fragment is present, and the empty string
+                # when the hyperlink has no address (internal jump link).
+                url = content.url
+                if not url:
+                    parts.append(content.text)
+                elif self.link_format == DOCXLinkFormat.MARKDOWN:
+                    parts.append(f"[{content.text}]({url})")
+                else:  # PLAIN format
+                    parts.append(f"{content.text} ({url})")
+
+        return "".join(parts)
+
     def _table_to_markdown(self, table: "Table") -> str:
         """
         Converts a DOCX table to a Markdown string.
 
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    The DOCXToDocument component now has an option to include extracted hyperlink addresses in the output Documents.
+    It accepts a `link_format` parameter that can be set to "markdown", "plain", or "none". The default is "none", which keeps the previous behavior of not extracting hyperlink addresses.
@@ -6,7 +6,7 @@
 from io import StringIO
 
 from haystack import Document, Pipeline
-from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
+from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
 from haystack.dataclasses import ByteStream
 
 
@@ -33,36 +33,36 @@ def test_to_dict(self):
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "none"},
         }
 
     def test_to_dict_custom_parameters(self):
-        converter = DOCXToDocument(table_format="markdown")
+        converter = DOCXToDocument(table_format="markdown", link_format="markdown")
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "markdown"},
+            "init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
         }
 
-        converter = DOCXToDocument(table_format="csv")
+        converter = DOCXToDocument(table_format="csv", link_format="plain")
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
         }
 
-        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
+        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN, link_format=DOCXLinkFormat.MARKDOWN)
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "markdown"},
+            "init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
         }
 
-        converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
+        converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.PLAIN)
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
         }
 
     def test_from_dict(self):
@@ -76,10 +76,11 @@ def test_from_dict(self):
     def test_from_dict_custom_parameters(self):
         data = {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"table_format": "markdown"},
+            "init_parameters": {"table_format": "markdown", "link_format": "markdown"},
         }
         converter = DOCXToDocument.from_dict(data)
         assert converter.table_format == DOCXTableFormat.MARKDOWN
+        assert converter.link_format == DOCXLinkFormat.MARKDOWN
 
     def test_from_dict_invalid_table_format(self):
         data = {
@@ -397,3 +398,53 @@ def test_document_with_docx_metadata_to_dict(self):
         # check it is JSON serializable
         json_str = json.dumps(doc.to_dict(flatten=False))
         assert json.loads(json_str) == doc.to_dict(flatten=False)
+
+    def test_link_format_initialization(self):
+        """The link format can be given either as a string or as an enum member."""
+        from_string = DOCXToDocument(link_format="markdown")
+        from_enum = DOCXToDocument(link_format=DOCXLinkFormat.PLAIN)
+
+        assert from_string.link_format == DOCXLinkFormat.MARKDOWN
+        assert from_enum.link_format == DOCXLinkFormat.PLAIN
+
+    def test_link_format_invalid(self):
+        """An unsupported link format string is rejected at construction time."""
+        unsupported = "invalid_format"
+        expected_error = "Unknown link format 'invalid_format'"
+        with pytest.raises(ValueError, match=expected_error):
+            DOCXToDocument(link_format=unsupported)
+
+    @pytest.mark.parametrize("link_format", ["markdown", "plain"])
+    def test_link_extraction(self, test_files_path, link_format):
+        """The hyperlink in the single-link sample file is rendered in the requested link format."""
+        expected_by_format = {
+            "markdown": "[PDF](https://en.wikipedia.org/wiki/PDF)",
+            "plain": "PDF (https://en.wikipedia.org/wiki/PDF)",
+        }
+        source = test_files_path / "docx" / "sample_docx_with_single_link.docx"
+
+        result = DOCXToDocument(link_format=link_format).run(sources=[source])
+
+        assert expected_by_format[link_format] in result["documents"][0].content
+
+    @pytest.mark.parametrize("link_format", ["markdown", "plain"])
+    def test_link_extraction_page_break(self, test_files_path, link_format):
+        """Each expected hyperlink in the multi-link sample file is rendered in the requested link format."""
+        expected_by_format = {
+            "markdown": [
+                "[PDF](https://en.wikipedia.org/wiki/PDF)",
+                "[of](https://en.wikipedia.org/wiki/OF)",
+                "[charge](https://en.wikipedia.org/wiki/Charge)",
+                "[disambiguation link](https://en.wikipedia.org/wiki/PDF_(disambiguation))",
+            ],
+            "plain": [
+                "PDF (https://en.wikipedia.org/wiki/PDF)",
+                "of (https://en.wikipedia.org/wiki/OF)",
+                "charge (https://en.wikipedia.org/wiki/Charge)",
+                "disambiguation link (https://en.wikipedia.org/wiki/PDF_(disambiguation))",
+            ],
+        }
+        source = test_files_path / "docx" / "sample_docx_with_links.docx"
+
+        result = DOCXToDocument(link_format=link_format).run(sources=[source])
+        content = result["documents"][0].content
+
+        for expected_link in expected_by_format[link_format]:
+            assert expected_link in content
+
+    def test_no_link_extraction(self, test_files_path):
+        """With the default link format, the link text is kept but no link formatting is added."""
+        docx_converter = DOCXToDocument()
+        paths = [test_files_path / "docx" / "sample_docx_with_single_link.docx"]
+        output = docx_converter.run(sources=paths)
+        content = output["documents"][0].content
+
+        # Guard against a vacuous pass: the negative checks below would also hold for empty content.
+        assert "PDF" in content
+        assert "[PDF](https://en.wikipedia.org/wiki/PDF)" not in content
+        assert "PDF (https://en.wikipedia.org/wiki/PDF)" not in content