NOTE(review): this patch was re-flowed from a whitespace-collapsed copy. The
indentation of context lines in the two modified files was reconstructed from
Python syntax, and the blob ids on the `index` lines of the two new files
predate the edits below -- confirm both before applying.

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..c79297cf7 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -35,6 +35,7 @@
     ImageConverter,
     AudioConverter,
     OutlookMsgConverter,
+    EmlConverter,
     ZipConverter,
     EpubConverter,
     DocumentIntelligenceConverter,
@@ -200,6 +201,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(IpynbConverter())
             self.register_converter(PdfConverter())
             self.register_converter(OutlookMsgConverter())
+            self.register_converter(EmlConverter(markitdown=self))
             self.register_converter(EpubConverter())
             self.register_converter(CsvConverter())
 
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index e4437a582..4d95daf4f 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -16,6 +16,7 @@
 from ._image_converter import ImageConverter
 from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
+from ._eml_converter import EmlConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import (
     DocumentIntelligenceConverter,
@@ -40,6 +41,7 @@
     "ImageConverter",
     "AudioConverter",
     "OutlookMsgConverter",
+    "EmlConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
     "DocumentIntelligenceFileType",
diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py
new file mode 100644
index 000000000..c50e2dda4
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py
@@ -0,0 +1,256 @@
+import email
+import email.message
+import email.policy
+import io
+import os
+import re
+import warnings
+from html import unescape
+from typing import TYPE_CHECKING, Any, BinaryIO
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._exceptions import FileConversionException, UnsupportedFormatException
+from .._stream_info import StreamInfo
+
+if TYPE_CHECKING:
+    from .._markitdown import MarkItDown
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "message/rfc822",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".eml"]
+
+# Patterns used by ``EmlConverter._strip_html``; compiled once at import time.
+_HTML_HIDDEN_BLOCK_RE = re.compile(
+    r"<(script|style)\b[^>]*>.*?</\1\s*>", re.IGNORECASE | re.DOTALL
+)
+_HTML_LINE_BREAK_RE = re.compile(r"<br\s*/?>|</p\s*>", re.IGNORECASE)
+_HTML_TAG_RE = re.compile(r"<[^>]+>")
+_EXCESS_NEWLINES_RE = re.compile(r"\n{3,}")
+
+
+class EmlConverter(DocumentConverter):
+    """Converts EML (RFC 822) email files to markdown.
+
+    Extracts email headers, body content, and optionally converts attachments
+    by passing them back through the MarkItDown converter pipeline.
+
+    Attachment conversion requires a MarkItDown instance, which is injected
+    automatically when the converter is registered via ``enable_builtins()``.
+    If no MarkItDown instance is provided, attachments are listed by filename
+    without being converted.
+
+    Example output::
+
+        # Email Message
+
+        **From:** sender@example.com
+        **To:** recipient@example.com
+        **Subject:** Q1 Report
+
+        ## Content
+
+        Please find the Q1 report attached.
+
+        ## Attachments
+
+        ### Q1_Report.xlsx
+
+        ## Sheet1
+        | Quarter | Revenue |
+        |---------|---------|
+        | Q1 2026 | 1.2M    |
+    """
+
+    def __init__(self, *, markitdown: "MarkItDown | None" = None) -> None:
+        """Store the optional MarkItDown instance used to convert attachments."""
+        super().__init__()
+        self._markitdown = markitdown
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True for a ``.eml`` extension or a ``message/rfc822`` MIME type."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """Render the message's headers, body and attachments as markdown.
+
+        The result title is the Subject header, or None when it is empty.
+        """
+        raw_bytes = file_stream.read()
+        msg = email.message_from_bytes(raw_bytes, policy=email.policy.default)
+
+        headers = {
+            name: str(msg.get(name, ""))
+            for name in ("From", "To", "Cc", "Subject", "Date")
+        }
+
+        sections = ["# Email Message\n\n"]
+        sections.extend(
+            f"**{name}:** {value}\n" for name, value in headers.items() if value
+        )
+        sections.append("\n## Content\n\n")
+        sections.append(self._get_body(msg))
+
+        attachments = self._get_attachments(msg)
+        if attachments:
+            sections.append("\n\n## Attachments\n\n")
+            for filename, content in attachments:
+                sections.append(f"### {filename}\n\n{content}\n\n")
+
+        return DocumentConverterResult(
+            markdown="".join(sections).strip(),
+            title=headers["Subject"] or None,
+        )
+
+    @staticmethod
+    def _is_attachment(part: email.message.Message) -> bool:
+        """Return True when the part's Content-Disposition is ``attachment``.
+
+        Compares the parsed disposition value rather than searching the raw
+        header, so an inline part whose filename merely contains the word
+        "attachment" is not misclassified.
+        """
+        return part.get_content_disposition() == "attachment"
+
+    def _get_body(self, msg: email.message.Message) -> str:
+        """Extract the body from the email message.
+
+        Prefers text/plain. Falls back to text/html with HTML tag stripping.
+        Skips parts that are attachments. Returns "" when no body is found.
+        """
+        if not msg.is_multipart():
+            if self._is_attachment(msg):
+                # Reported by _get_attachments instead of being dumped as text.
+                return ""
+            body = self._decode_part(msg)
+            if msg.get_content_type() == "text/html":
+                return self._strip_html(body)
+            return body
+
+        html_part = None
+        for part in msg.walk():
+            if self._is_attachment(part):
+                continue
+
+            content_type = part.get_content_type()
+            if content_type == "text/plain":
+                # The first plain-text part wins outright.
+                return self._decode_part(part)
+            if content_type == "text/html" and html_part is None:
+                html_part = part
+
+        if html_part is not None:
+            return self._strip_html(self._decode_part(html_part))
+        return ""
+
+    def _get_attachments(self, msg: email.message.Message) -> list[tuple[str, str]]:
+        """Extract and convert email attachments.
+
+        Iterates over all MIME parts with Content-Disposition: attachment,
+        including a single-part message that is itself an attachment. Parts
+        without a decodable payload (multipart containers) are skipped.
+
+        Returns a list of (filename, markdown_content) tuples.
+        """
+        results: list[tuple[str, str]] = []
+
+        for part in msg.walk():
+            if not self._is_attachment(part):
+                continue
+
+            payload = part.get_payload(decode=True)
+            if payload is None:
+                continue
+
+            filename = part.get_filename() or "attachment"
+            results.append((filename, self._convert_attachment(filename, payload)))
+
+        return results
+
+    def _convert_attachment(self, filename: str, payload: bytes) -> str:
+        """Convert one attachment payload to markdown.
+
+        Returns the converted markdown, or an italic note explaining why the
+        attachment could not be converted. Conversion errors are reported in the
+        note rather than raised, so one bad attachment cannot fail the email.
+        """
+        if self._markitdown is None:
+            return (
+                "*Attachment present but not converted. "
+                "Pass a MarkItDown instance to EmlConverter to enable attachment conversion.*"
+            )
+
+        ext = os.path.splitext(filename)[1].lower()
+        try:
+            result = self._markitdown.convert_stream(
+                stream=io.BytesIO(payload),
+                stream_info=StreamInfo(extension=ext, filename=filename),
+            )
+        except UnsupportedFormatException:
+            return f"*Attachment not converted: unsupported format (`{ext}`).*"
+        except FileConversionException as e:
+            return f"*Attachment not converted: {e}*"
+        except Exception as e:
+            # Best-effort boundary: report the failure and keep going.
+            warnings.warn(
+                f"Unexpected error converting attachment '{filename}': {e}",
+                stacklevel=3,
+            )
+            return "*Attachment not converted due to an unexpected error.*"
+
+        if result is not None and result.markdown:
+            return result.markdown
+        return "*Attachment converted but produced no content.*"
+
+    def _decode_part(self, part: email.message.Message) -> str:
+        """Decode a MIME part's payload to a stripped string.
+
+        Uses the part's declared charset (default UTF-8) and falls back to
+        lenient UTF-8 decoding when that charset is unknown or does not fit.
+        """
+        payload = part.get_payload(decode=True)
+        if payload is None:
+            return ""
+        charset = part.get_content_charset() or "utf-8"
+        try:
+            return payload.decode(charset).strip()
+        except (UnicodeDecodeError, LookupError):
+            return payload.decode("utf-8", errors="ignore").strip()
+
+    def _decode_payload(self, msg: email.message.Message) -> str:
+        """Decode a non-multipart message payload to a string."""
+        # A non-multipart message is decoded exactly like any other part.
+        return self._decode_part(msg)
+
+    def _strip_html(self, html: str) -> str:
+        """Reduce an HTML fragment to plain text.
+
+        Drops ``<script>``/``<style>`` blocks, turns ``<br>`` and ``</p>`` into
+        newlines, removes the remaining tags, decodes character references
+        (after tag removal, so an escaped ``&lt;b&gt;`` survives as text) and
+        collapses runs of three or more newlines.
+        """
+        text = _HTML_HIDDEN_BLOCK_RE.sub("", html)
+        text = _HTML_LINE_BREAK_RE.sub("\n", text)
+        text = _HTML_TAG_RE.sub("", text)
+        # Non-breaking spaces become ordinary spaces in the markdown output.
+        text = unescape(text).replace("\xa0", " ")
+        text = _EXCESS_NEWLINES_RE.sub("\n\n", text)
+        return text.strip()
diff --git a/packages/markitdown/tests/test_eml_converter.py b/packages/markitdown/tests/test_eml_converter.py
new file mode 100644
index 000000000..908f581c6
--- /dev/null
+++ b/packages/markitdown/tests/test_eml_converter.py
@@ -0,0 +1,229 @@
+"""Tests for EmlConverter: body extraction, header parsing, and attachment conversion."""
+
+import io
+from email.mime.base import MIMEBase
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from unittest.mock import MagicMock
+
+
+from markitdown import StreamInfo
+from markitdown.converters import EmlConverter
+
+
+def _make_simple_eml(
+    subject="Test Subject",
+    from_addr="sender@example.com",
+    to_addr="recipient@example.com",
+    body="Hello, this is the email body.",
+) -> bytes:
+    """Build a plain-text EML message as bytes."""
+    msg = MIMEText(body, "plain", "utf-8")
+    msg["Subject"] = subject
+    msg["From"] = from_addr
+    msg["To"] = to_addr
+    msg["Date"] = "Sat, 04 Apr 2026 12:00:00 +0000"
+    return msg.as_bytes()
+
+
+def _make_multipart_eml(body_text: str, attachments: list[tuple[str, bytes]]) -> bytes:
+    """Build a multipart EML with a plain-text body and file attachments."""
+    msg = MIMEMultipart()
+    msg["Subject"] = "Multipart Test"
+    msg["From"] = "sender@example.com"
+    msg["To"] = "recipient@example.com"
+
+    msg.attach(MIMEText(body_text, "plain", "utf-8"))
+
+    for filename, data in attachments:
+        part = MIMEBase("application", "octet-stream")
+        part.set_payload(data)
+        part.add_header("Content-Disposition", "attachment", filename=filename)
+        msg.attach(part)
+
+    return msg.as_bytes()
+
+
+class TestEmlConverterBody:
+    def test_simple_plain_text_email(self):
+        eml = _make_simple_eml(body="This is the body.")
+        converter = EmlConverter()
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+        assert "# Email Message" in result.markdown
+        assert "**From:** sender@example.com" in result.markdown
+        assert "**To:** recipient@example.com" in result.markdown
+        assert "**Subject:** Test Subject" in result.markdown
+        assert "This is the body." in result.markdown
+
+    def test_title_is_subject(self):
+        eml = _make_simple_eml(subject="My Subject")
+        converter = EmlConverter()
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+        assert result.title == "My Subject"
+
+    def test_empty_subject_gives_none_title(self):
+        eml = _make_simple_eml(subject="")
+        converter = EmlConverter()
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+        assert result.title is None
+
+    def test_missing_cc_not_shown(self):
+        eml = _make_simple_eml()
+        converter = EmlConverter()
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+        assert "**Cc:**" not in result.markdown
+
+    def test_accepts_eml_extension(self):
+        converter = EmlConverter()
+        assert converter.accepts(
+            io.BytesIO(b""),
+            StreamInfo(extension=".eml"),
+        )
+
+    def test_accepts_rfc822_mimetype(self):
+        converter = EmlConverter()
+        assert converter.accepts(
+            io.BytesIO(b""),
+            StreamInfo(mimetype="message/rfc822"),
+        )
+
+    def test_rejects_unrelated_extension(self):
+        converter = EmlConverter()
+        assert not converter.accepts(
+            io.BytesIO(b""),
+            StreamInfo(extension=".pdf"),
+        )
+
+
+class TestEmlConverterAttachments:
+    def test_no_attachments_no_section(self):
+        eml = _make_simple_eml()
+        converter = EmlConverter()
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+        assert "## Attachments" not in result.markdown
+
+    def test_attachment_without_markitdown_shows_notice(self):
+        eml = _make_multipart_eml("Body text.", [("report.pdf", b"%PDF-fake")])
+        converter = EmlConverter(markitdown=None)
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+        assert "## Attachments" in result.markdown
+        assert "report.pdf" in result.markdown
+        assert "Pass a MarkItDown instance" in result.markdown
+
+    def test_attachment_converted_with_markitdown(self):
+        mock_md = MagicMock()
+        mock_result = MagicMock()
+        mock_result.markdown = "| col1 | col2 |\n|------|------|\n| a | b |"
+        mock_md.convert_stream.return_value = mock_result
+
+        eml = _make_multipart_eml("See attached.", [("data.csv", b"col1,col2\na,b")])
+        converter = EmlConverter(markitdown=mock_md)
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+
+        assert "## Attachments" in result.markdown
+        assert "### data.csv" in result.markdown
+        assert "col1 | col2" in result.markdown
+        mock_md.convert_stream.assert_called_once()
+
+    def test_unsupported_attachment_shows_note(self):
+        from markitdown._exceptions import UnsupportedFormatException
+
+        mock_md = MagicMock()
+        mock_md.convert_stream.side_effect = UnsupportedFormatException("not supported")
+
+        eml = _make_multipart_eml("Body.", [("weird.xyz", b"\x00\x01\x02")])
+        converter = EmlConverter(markitdown=mock_md)
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+
+        assert "weird.xyz" in result.markdown
+        assert "unsupported format" in result.markdown
+
+    def test_multiple_attachments_all_listed(self):
+        mock_md = MagicMock()
+        mock_md.convert_stream.return_value = MagicMock(markdown="converted content")
+
+        attachments = [("first.txt", b"hello"), ("second.txt", b"world")]
+        eml = _make_multipart_eml("Body.", attachments)
+        converter = EmlConverter(markitdown=mock_md)
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+
+        assert "### first.txt" in result.markdown
+        assert "### second.txt" in result.markdown
+        assert mock_md.convert_stream.call_count == 2
+
+    def test_body_excludes_attachment_parts(self):
+        """Body extraction should not include attachment MIME parts as text."""
+        eml = _make_multipart_eml(
+            "This is the real body.",
+            [("attach.txt", b"This should not be in the body.")],
+        )
+        converter = EmlConverter()
+        result = converter.convert(
+            io.BytesIO(eml),
+            StreamInfo(extension=".eml"),
+        )
+        assert "This is the real body." in result.markdown
+        # Attachment content should not bleed into the body section
+        content_section = result.markdown.split("## Content")[1]
+        assert "This should not be in the body." not in content_section
+
+
+class TestEmlConverterHtmlAndInlineParts:
+    def test_html_only_body_is_converted_to_text(self):
+        """An HTML-only body is reduced to text with entities decoded."""
+        html_body = "<p>Fish &amp; chips</p><p>1 &lt; 2<br>done</p>"
+        msg = MIMEText(html_body, "html", "utf-8")
+        msg["Subject"] = "HTML Test"
+        converter = EmlConverter()
+        result = converter.convert(
+            io.BytesIO(msg.as_bytes()),
+            StreamInfo(extension=".eml"),
+        )
+        content_section = result.markdown.split("## Content")[1]
+        assert "Fish & chips" in content_section
+        assert "1 < 2" in content_section
+        assert "<p>" not in content_section
+
+    def test_inline_part_named_attachment_is_not_an_attachment(self):
+        """Only a parsed disposition of ``attachment`` marks a part as one."""
+        msg = MIMEMultipart()
+        msg["Subject"] = "Inline Test"
+        msg.attach(MIMEText("Body text.", "plain", "utf-8"))
+        inline = MIMEText("Inline notes.", "plain", "utf-8")
+        inline.add_header(
+            "Content-Disposition", "inline", filename="attachment-notes.txt"
+        )
+        msg.attach(inline)
+        converter = EmlConverter()
+        result = converter.convert(
+            io.BytesIO(msg.as_bytes()),
+            StreamInfo(extension=".eml"),
+        )
+        assert "## Attachments" not in result.markdown