Skip to content

Commit e64db61

Browse files
authored
feat: include hyperlink addresses in DOCXToDocument output (#9109)
* add DOCXLinkFormat * handle page breaks * add sample docx files * make no link extraction the default * reno * docstring and comment
1 parent f9cce8b commit e64db61

5 files changed

Lines changed: 143 additions & 19 deletions

File tree

haystack/components/converters/docx.py

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
import docx
2323
from docx.document import Document as DocxDocument
2424
from docx.table import Table
25+
from docx.text.hyperlink import Hyperlink
2526
from docx.text.paragraph import Paragraph
27+
from docx.text.run import Run
2628
from lxml.etree import _Comment
2729

2830

@@ -89,6 +91,31 @@ def from_str(string: str) -> "DOCXTableFormat":
8991
return table_format
9092

9193

94+
class DOCXLinkFormat(Enum):
    """
    Output styles available for hyperlink information extracted from a DOCX file.

    Each member's value is the lowercase string accepted by `from_str`.
    """

    MARKDOWN = "markdown"
    PLAIN = "plain"
    NONE = "none"

    def __str__(self):
        # Render the member as its raw string value (e.g. "markdown").
        return self.value

    @staticmethod
    def from_str(string: str) -> "DOCXLinkFormat":
        """
        Look up the DOCXLinkFormat member whose value matches a string, ignoring case.

        :param string: Name of the link format, e.g. "markdown", "plain" or "none".
        :returns: The matching DOCXLinkFormat member.
        :raises ValueError: If the lowercased string is not the value of any member.
        """
        enum_map = {member.value: member for member in DOCXLinkFormat}
        key = string.lower()
        if key in enum_map:
            return enum_map[key]
        msg = f"Unknown link format '{string}'. Supported formats are: {list(enum_map.keys())}"
        raise ValueError(msg)
117+
118+
92119
@component
93120
class DOCXToDocument:
94121
"""
@@ -99,28 +126,38 @@ class DOCXToDocument:
99126
100127
Usage example:
101128
```python
102-
from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
129+
from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
103130
104-
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
131+
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.MARKDOWN)
105132
results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
106133
documents = results["documents"]
107134
print(documents[0].content)
108135
# 'This is a text from the DOCX file.'
109136
```
110137
"""
111138

112-
def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False):
139+
def __init__(
140+
self,
141+
table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV,
142+
link_format: Union[str, DOCXLinkFormat] = DOCXLinkFormat.NONE,
143+
store_full_path: bool = False,
144+
):
113145
"""
114146
Create a DOCXToDocument component.
115147
116148
:param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
117-
DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
149+
DOCXTableFormat.CSV, "markdown", or "csv".
150+
:param link_format: The format for link output. Can be either:
151+
DOCXLinkFormat.MARKDOWN or "markdown" to get [text](address),
152+
DOCXLinkFormat.PLAIN or "plain" to get text (address),
153+
DOCXLinkFormat.NONE or "none" to get text without links.
118154
:param store_full_path:
119155
If True, the full path of the file is stored in the metadata of the document.
120156
If False, only the file name is stored.
121157
"""
122158
docx_import.check()
123159
self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
160+
self.link_format = DOCXLinkFormat.from_str(link_format) if isinstance(link_format, str) else link_format
124161
self.store_full_path = store_full_path
125162

126163
def to_dict(self) -> Dict[str, Any]:
@@ -130,7 +167,12 @@ def to_dict(self) -> Dict[str, Any]:
130167
:returns:
131168
Dictionary with serialized data.
132169
"""
133-
return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)
170+
return default_to_dict(
171+
self,
172+
table_format=str(self.table_format),
173+
link_format=str(self.link_format),
174+
store_full_path=self.store_full_path,
175+
)
134176

135177
@classmethod
136178
def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
@@ -144,6 +186,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
144186
"""
145187
if "table_format" in data["init_parameters"]:
146188
data["init_parameters"]["table_format"] = DOCXTableFormat.from_str(data["init_parameters"]["table_format"])
189+
if "link_format" in data["init_parameters"]:
190+
data["init_parameters"]["link_format"] = DOCXLinkFormat.from_str(data["init_parameters"]["link_format"])
147191
return default_from_dict(cls, data)
148192

149193
@component.output_types(documents=List[Document])
@@ -218,7 +262,7 @@ def _extract_elements(self, document: "DocxDocument") -> List[str]:
218262
if paragraph.contains_page_break:
219263
para_text = self._process_paragraph_with_page_breaks(paragraph)
220264
else:
221-
para_text = paragraph.text
265+
para_text = self._process_links_in_paragraph(paragraph)
222266
elements.append(para_text)
223267
elif element.tag.endswith("tbl"):
224268
table = docx.table.Table(element, document)
@@ -244,18 +288,42 @@ def _process_paragraph_with_page_breaks(self, paragraph: "Paragraph") -> str:
244288
# Can only extract text from first paragraph page break, unfortunately
245289
if pb_index == 0:
246290
if page_break.preceding_paragraph_fragment:
247-
para_text += page_break.preceding_paragraph_fragment.text
291+
para_text += self._process_links_in_paragraph(page_break.preceding_paragraph_fragment)
248292
para_text += "\f"
249293
if page_break.following_paragraph_fragment:
250294
# following_paragraph_fragment contains all text for remainder of paragraph.
251295
# However, if the remainder of the paragraph spans multiple page breaks, it won't include
252296
# those later page breaks so we have to add them at end of text in the `else` block below.
253297
# This is not ideal, but this case should be very rare and this is likely good enough.
254-
para_text += page_break.following_paragraph_fragment.text
298+
para_text += self._process_links_in_paragraph(page_break.following_paragraph_fragment)
255299
else:
256300
para_text += "\f"
257301
return para_text
258302

303+
def _process_links_in_paragraph(self, paragraph: "Paragraph") -> str:
    """
    Builds the text of a paragraph, rendering hyperlinks according to `self.link_format`.

    :param paragraph: The DOCX paragraph to process.
    :returns: The paragraph text. With `DOCXLinkFormat.NONE` this is `paragraph.text` unchanged. Otherwise
        runs contribute their text as-is and each hyperlink is rendered as `[text](address)` (MARKDOWN)
        or `text (address)` (PLAIN). A hyperlink with a blank address contributes only its text.
    """
    if self.link_format == DOCXLinkFormat.NONE:
        return paragraph.text

    parts = []
    # Iterate over all hyperlinks and other content in the paragraph
    # https://python-docx.readthedocs.io/en/latest/api/text.html#docx.text.paragraph.Paragraph.iter_inner_content
    for content in paragraph.iter_inner_content():
        if isinstance(content, Run):
            parts.append(content.text)
        elif isinstance(content, Hyperlink):
            address = content.address
            if not address:
                # python-docx reports a blank address for internal jumps (e.g. table-of-contents
                # entries); emitting "[text]()" or "text ()" would be a dangling link, so keep the text only.
                parts.append(content.text)
            elif self.link_format == DOCXLinkFormat.MARKDOWN:
                parts.append(f"[{content.text}]({address})")
            else:  # PLAIN format
                parts.append(f"{content.text} ({address})")

    return "".join(parts)
326+
259327
def _table_to_markdown(self, table: "Table") -> str:
260328
"""
261329
Converts a DOCX table to a Markdown string.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
features:
3+
- |
4+
The DOCXToDocument component now has an option to include extracted hyperlink addresses in the output Documents.
5+
It accepts a `link_format` parameter that can be set to "markdown", "plain", or "none". By default ("none"), hyperlink addresses are not extracted, which matches the previous behavior.

test/components/converters/test_docx_file_to_document.py

Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from io import StringIO
77

88
from haystack import Document, Pipeline
9-
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
9+
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
1010
from haystack.dataclasses import ByteStream
1111

1212

@@ -33,36 +33,36 @@ def test_to_dict(self):
3333
data = converter.to_dict()
3434
assert data == {
3535
"type": "haystack.components.converters.docx.DOCXToDocument",
36-
"init_parameters": {"store_full_path": False, "table_format": "csv"},
36+
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "none"},
3737
}
3838

3939
def test_to_dict_custom_parameters(self):
40-
converter = DOCXToDocument(table_format="markdown")
40+
converter = DOCXToDocument(table_format="markdown", link_format="markdown")
4141
data = converter.to_dict()
4242
assert data == {
4343
"type": "haystack.components.converters.docx.DOCXToDocument",
44-
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
44+
"init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
4545
}
4646

47-
converter = DOCXToDocument(table_format="csv")
47+
converter = DOCXToDocument(table_format="csv", link_format="plain")
4848
data = converter.to_dict()
4949
assert data == {
5050
"type": "haystack.components.converters.docx.DOCXToDocument",
51-
"init_parameters": {"store_full_path": False, "table_format": "csv"},
51+
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
5252
}
5353

54-
converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
54+
converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN, link_format=DOCXLinkFormat.MARKDOWN)
5555
data = converter.to_dict()
5656
assert data == {
5757
"type": "haystack.components.converters.docx.DOCXToDocument",
58-
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
58+
"init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
5959
}
6060

61-
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
61+
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.PLAIN)
6262
data = converter.to_dict()
6363
assert data == {
6464
"type": "haystack.components.converters.docx.DOCXToDocument",
65-
"init_parameters": {"store_full_path": False, "table_format": "csv"},
65+
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
6666
}
6767

6868
def test_from_dict(self):
@@ -76,10 +76,11 @@ def test_from_dict(self):
7676
def test_from_dict_custom_parameters(self):
7777
data = {
7878
"type": "haystack.components.converters.docx.DOCXToDocument",
79-
"init_parameters": {"table_format": "markdown"},
79+
"init_parameters": {"table_format": "markdown", "link_format": "markdown"},
8080
}
8181
converter = DOCXToDocument.from_dict(data)
8282
assert converter.table_format == DOCXTableFormat.MARKDOWN
83+
assert converter.link_format == DOCXLinkFormat.MARKDOWN
8384

8485
def test_from_dict_invalid_table_format(self):
8586
data = {
@@ -397,3 +398,53 @@ def test_document_with_docx_metadata_to_dict(self):
397398
# check it is JSON serializable
398399
json_str = json.dumps(doc.to_dict(flatten=False))
399400
assert json.loads(json_str) == doc.to_dict(flatten=False)
401+
402+
def test_link_format_initialization(self):
    """`link_format` may be given as a string or as a DOCXLinkFormat member."""
    cases = [
        ("markdown", DOCXLinkFormat.MARKDOWN),
        (DOCXLinkFormat.PLAIN, DOCXLinkFormat.PLAIN),
    ]
    for given, expected in cases:
        converter = DOCXToDocument(link_format=given)
        assert converter.link_format == expected
408+
409+
def test_link_format_invalid(self):
    """An unrecognised `link_format` string is rejected with a ValueError at construction time."""
    expected_error = pytest.raises(ValueError, match="Unknown link format 'invalid_format'")
    with expected_error:
        DOCXToDocument(link_format="invalid_format")
412+
413+
@pytest.mark.parametrize("link_format", ["markdown", "plain"])
def test_link_extraction(self, test_files_path, link_format):
    """The single hyperlink in the sample file is rendered in the requested link format."""
    sample = test_files_path / "docx" / "sample_docx_with_single_link.docx"
    result = DOCXToDocument(link_format=link_format).run(sources=[sample])
    content = result["documents"][0].content

    expected = (
        "[PDF](https://en.wikipedia.org/wiki/PDF)"
        if link_format == "markdown"
        else "PDF (https://en.wikipedia.org/wiki/PDF)"  # plain format
    )
    assert expected in content
424+
425+
@pytest.mark.parametrize("link_format", ["markdown", "plain"])
def test_link_extraction_page_break(self, test_files_path, link_format):
    """Every hyperlink in the multi-link sample file is rendered in the requested link format."""
    sample = test_files_path / "docx" / "sample_docx_with_links.docx"
    result = DOCXToDocument(link_format=link_format).run(sources=[sample])
    content = result["documents"][0].content

    if link_format == "markdown":
        expected_links = [
            "[PDF](https://en.wikipedia.org/wiki/PDF)",
            "[of](https://en.wikipedia.org/wiki/OF)",
            "[charge](https://en.wikipedia.org/wiki/Charge)",
            "[disambiguation link](https://en.wikipedia.org/wiki/PDF_(disambiguation))",
        ]
    else:  # plain format
        expected_links = [
            "PDF (https://en.wikipedia.org/wiki/PDF)",
            "of (https://en.wikipedia.org/wiki/OF)",
            "charge (https://en.wikipedia.org/wiki/Charge)",
            "disambiguation link (https://en.wikipedia.org/wiki/PDF_(disambiguation))",
        ]

    for link in expected_links:
        assert link in content
442+
443+
def test_no_link_extraction(self, test_files_path):
    """With default settings, neither the markdown nor the plain link rendering appears in the output."""
    sample = test_files_path / "docx" / "sample_docx_with_single_link.docx"
    result = DOCXToDocument().run(sources=[sample])
    content = result["documents"][0].content

    formatted_links = (
        "[PDF](https://en.wikipedia.org/wiki/PDF)",
        "PDF (https://en.wikipedia.org/wiki/PDF)",
    )
    for formatted in formatted_links:
        assert formatted not in content
15.3 KB
Binary file not shown.
14.4 KB
Binary file not shown.

0 commit comments

Comments
 (0)