Skip to content

Commit 3fbcac7

Browse files
feat: extract Markdown frontmatter metadata (#11615)
Co-authored-by: gyx09212214-prog <243787584+gyx09212214-prog@users.noreply.github.com>
1 parent 38cf047 commit 3fbcac7

4 files changed

Lines changed: 163 additions & 2 deletions

File tree

docs-website/docs/pipeline-components/converters/markdowntodocument.mdx

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ The `MarkdownToDocument` component converts Markdown files into documents. You c
2828

2929
When you initialize the component, you can optionally turn off progress bars by setting `progress_bar` to `False`. If you want to convert the contents of tables into a single line, you can enable that through the `table_to_single_line` parameter.
3030

31+
If your Markdown files start with YAML frontmatter, set `extract_frontmatter=True` to move that data into `Document.meta` and remove it from the converted document content. Metadata passed through the `meta` input takes precedence over frontmatter keys, and frontmatter keys take precedence over metadata already attached to a `ByteStream` source.
32+
3133
## Usage
3234

3335
You need to install the `markdown-it-py` and `mdit_plain` packages to use the `MarkdownToDocument` component:
@@ -46,6 +48,31 @@ converter = MarkdownToDocument()
4648
docs = converter.run(sources=Path("my_file.md"))
4749
```
4850

51+
### With YAML frontmatter
52+
53+
Given `equity_note.md`:
54+
55+
```markdown
56+
---
57+
ticker: AAPL
58+
source: earnings_call
59+
date: 2026-06-12
60+
---
61+
62+
# Thesis
63+
Revenue guidance improved.
64+
```
65+
66+
```python
67+
from haystack.components.converters import MarkdownToDocument
68+
69+
converter = MarkdownToDocument(extract_frontmatter=True)
70+
71+
docs = converter.run(sources=["equity_note.md"])["documents"]
72+
print(docs[0].meta["ticker"])
73+
print(docs[0].content)
74+
```
75+
4976
### In a pipeline
5077

5178
```python

haystack/components/converters/markdown.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
import json
56
import os
7+
import re
68
from pathlib import Path
79
from typing import Any
810

11+
import yaml
912
from tqdm import tqdm
1013

1114
from haystack import Document, component, logging
@@ -20,6 +23,8 @@
2023

2124
logger = logging.getLogger(__name__)
2225

26+
# Matches a YAML frontmatter block only at the very start of the text: an opening `---` line,
# zero or more lines that are not themselves a `---` delimiter, and a closing `---` line.
# The body is scanned line by line (rather than with a lazy DOTALL `.*?` that requires a newline
# before the closing delimiter) so that an empty block -- `---` immediately followed by `---` --
# matches right away instead of running on to a later `---` horizontal rule in the Markdown body.
# `[^\n]*\n` keeps every line match unambiguous, including for CRLF input.
# The `frontmatter` group holds the raw YAML text, including its trailing line break.
_FRONTMATTER_PATTERN = re.compile(
    r"\A---[ \t]*\r?\n"
    r"(?P<frontmatter>(?:(?!---[ \t]*(?:\r?\n|$))[^\n]*\n)*)"
    r"---[ \t]*(?:\r?\n|$)"
)
27+
2328

2429
@component
2530
class MarkdownToDocument:
@@ -43,7 +48,12 @@ class MarkdownToDocument:
4348
"""
4449

4550
def __init__(
46-
self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False
51+
self,
52+
table_to_single_line: bool = False,
53+
progress_bar: bool = True,
54+
store_full_path: bool = False,
55+
*,
56+
extract_frontmatter: bool = False,
4757
) -> None:
4858
"""
4959
Create a MarkdownToDocument component.
@@ -55,12 +65,16 @@ def __init__(
5565
:param store_full_path:
5666
If True, the full path of the file is stored in the metadata of the document.
5767
If False, only the file name is stored.
68+
:param extract_frontmatter:
69+
If True, YAML frontmatter at the beginning of the Markdown file is
70+
removed from the document content and added to the document metadata.
5871
"""
5972
markdown_conversion_imports.check()
6073

6174
self.table_to_single_line = table_to_single_line
6275
self.progress_bar = progress_bar
6376
self.store_full_path = store_full_path
77+
self.extract_frontmatter = extract_frontmatter
6478

6579
@component.output_types(documents=list[Document])
6680
def run(
@@ -103,6 +117,7 @@ def run(
103117
continue
104118
try:
105119
file_content = bytestream.data.decode("utf-8")
120+
file_content, frontmatter = self._extract_frontmatter(file_content, source)
106121
text = parser.render(file_content)
107122
except Exception as conversion_e:
108123
logger.warning(
@@ -112,7 +127,7 @@ def run(
112127
)
113128
continue
114129

115-
merged_metadata = {**bytestream.meta, **metadata}
130+
merged_metadata = {**bytestream.meta, **frontmatter, **metadata}
116131

117132
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
118133
merged_metadata["file_path"] = os.path.basename(file_path)
@@ -121,3 +136,39 @@ def run(
121136
documents.append(document)
122137

123138
return {"documents": documents}
139+
140+
def _extract_frontmatter(self, file_content: str, source: str | Path | ByteStream) -> tuple[str, dict[str, Any]]:
    """
    Split leading YAML frontmatter off a Markdown string.

    :param file_content:
        The decoded Markdown text.
    :param source:
        The source the text came from. Only used in warning messages.
    :returns:
        A tuple of the content and the extracted metadata. When extraction is disabled, no
        frontmatter block is found, the YAML cannot be parsed or converted, or the YAML is not a
        mapping, the content is returned unchanged together with an empty dictionary. Otherwise
        the frontmatter block is removed from the content and its keys are returned.
    """
    if not self.extract_frontmatter:
        return file_content, {}

    match = _FRONTMATTER_PATTERN.match(file_content)
    if not match:
        return file_content, {}

    frontmatter_text = match.group("frontmatter")
    try:
        # The JSON round trip turns YAML-native values such as dates into plain strings
        # (via `default=str`) and fails on structures JSON cannot represent, e.g. cycles.
        frontmatter = json.loads(json.dumps(yaml.safe_load(frontmatter_text), default=str))
    except yaml.YAMLError as error:
        logger.warning(
            "Could not parse YAML frontmatter in {source}. Keeping it as content. Error: {error}",
            source=source,
            error=error,
        )
        return file_content, {}
    except (TypeError, ValueError) as error:
        logger.warning(
            "Could not convert YAML frontmatter in {source}. Keeping it as content. Error: {error}",
            source=source,
            error=error,
        )
        return file_content, {}

    # Only an empty (or comment-only) block counts as "no metadata". Other falsy values such as
    # `[]`, `0` or `false` are not mappings and must reach the type check below rather than being
    # silently turned into an empty dictionary.
    if frontmatter is None:
        frontmatter = {}

    if not isinstance(frontmatter, dict):
        logger.warning(
            "Ignoring YAML frontmatter in {source}: expected a mapping, got {kind}.",
            source=source,
            kind=type(frontmatter).__name__,
        )
        return file_content, {}

    return file_content[match.end() :], frontmatter
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
features:
3+
- |
4+
Added optional YAML frontmatter extraction to ``MarkdownToDocument``. When initialized with
5+
``extract_frontmatter=True``, YAML frontmatter at the beginning of a Markdown file is removed from
6+
the converted content and added to ``Document.meta``.

test/components/converters/test_markdown_to_document.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def test_init_params_default(self):
1616
converter = MarkdownToDocument()
1717
assert converter.table_to_single_line is False
1818
assert converter.progress_bar is True
19+
assert converter.extract_frontmatter is False
1920

2021
def test_init_params_custom(self):
2122
converter = MarkdownToDocument(table_to_single_line=True, progress_bar=False, store_full_path=False)
@@ -79,6 +80,82 @@ def test_run_with_meta(self, test_files_path):
7980
assert output["documents"][0].meta["language"] == "it"
8081
assert output["documents"][1].meta["language"] == "it"
8182

83+
def test_run_extracts_yaml_frontmatter_into_metadata(self):
    """Frontmatter keys end up in ``Document.meta`` and the YAML block is removed from the content."""
    markdown_lines = [
        b"---",
        b"ticker: AAPL",
        b"date: 2026-06-12",
        b"rating_score: 4",
        b"source: earnings_call",
        b"tags:",
        b" - guidance",
        b"---",
        b"# Thesis",
        b"Revenue guidance improved.",
    ]
    stream = ByteStream(data=b"\n".join(markdown_lines) + b"\n", meta={"file_path": "/tmp/aapl.md"})

    result = MarkdownToDocument(progress_bar=False, extract_frontmatter=True).run(sources=[stream])
    doc = result["documents"][0]

    # The body survives, the raw YAML does not.
    assert "Revenue guidance improved." in doc.content
    assert "ticker: AAPL" not in doc.content

    # Checked key by key, in the same order as the individual expectations they replace.
    expected_meta = {
        "ticker": "AAPL",
        "date": "2026-06-12",
        "rating_score": 4,
        "source": "earnings_call",
        "tags": ["guidance"],
        "file_path": "aapl.md",
    }
    for key, value in expected_meta.items():
        assert doc.meta[key] == value
112+
113+
def test_run_keeps_frontmatter_as_content_by_default(self):
    """Without ``extract_frontmatter`` the YAML block stays in the content and adds no metadata."""
    stream = ByteStream(data=b"---\nticker: AAPL\n---\n# Thesis\n")

    result = MarkdownToDocument(progress_bar=False).run(sources=[stream])
    doc = result["documents"][0]

    assert "ticker: AAPL" in doc.content
    assert "ticker" not in doc.meta
122+
123+
def test_run_meta_overrides_frontmatter_metadata(self):
    """The ``meta`` input beats frontmatter, and frontmatter beats the ByteStream's own metadata."""
    raw_markdown = b"---\nticker: AAPL\nsource: filing\n---\n# Thesis\n"
    stream = ByteStream(data=raw_markdown, meta={"source": "bytestream"})

    component_under_test = MarkdownToDocument(progress_bar=False, extract_frontmatter=True)
    result = component_under_test.run(sources=[stream], meta={"ticker": "MSFT"})
    doc = result["documents"][0]

    assert doc.meta["ticker"] == "MSFT"
    assert doc.meta["source"] == "filing"
134+
135+
def test_run_keeps_malformed_frontmatter_as_content_and_logs_warning(self, caplog):
    """Unparseable YAML is left in the content, yields no metadata, and triggers a warning."""
    stream = ByteStream(data=b"---\nticker: [AAPL\n---\n# Thesis\n")
    component_under_test = MarkdownToDocument(progress_bar=False, extract_frontmatter=True)

    with caplog.at_level(logging.WARNING):
        result = component_under_test.run(sources=[stream])

    doc = result["documents"][0]
    assert "ticker: [AAPL" in doc.content
    assert "ticker" not in doc.meta
    assert "Could not parse YAML frontmatter" in caplog.text
146+
147+
def test_run_keeps_unserializable_frontmatter_as_content_and_logs_warning(self, caplog):
    """Self-referencing YAML cannot be converted, so it stays in the content and a warning is logged."""
    stream = ByteStream(data=b"---\ncycle: &cycle\n - *cycle\n---\n# Thesis\n")
    component_under_test = MarkdownToDocument(progress_bar=False, extract_frontmatter=True)

    with caplog.at_level(logging.WARNING):
        result = component_under_test.run(sources=[stream])

    doc = result["documents"][0]
    assert "cycle:" in doc.content
    assert "cycle" not in doc.meta
    assert "Could not convert YAML frontmatter" in caplog.text
158+
82159
@pytest.mark.integration
83160
def test_run_wrong_file_type(self, test_files_path, caplog):
84161
"""

0 commit comments

Comments
 (0)