From ea9af600869b0f0a2b3336ba67be9640a59422a7 Mon Sep 17 00:00:00 2001 From: Luis Lopes Date: Fri, 28 Nov 2025 18:52:08 +0100 Subject: [PATCH 1/4] add eml converter --- .../markitdown/src/markitdown/_markitdown.py | 2 + .../markitdown/converters/_eml_converter.py | 92 +++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 packages/markitdown/src/markitdown/converters/_eml_converter.py diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 702b10c68..55798347c 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -39,6 +39,7 @@ EpubConverter, DocumentIntelligenceConverter, CsvConverter, + EmlConverter, ) from ._base_converter import DocumentConverter, DocumentConverterResult @@ -195,6 +196,7 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(OutlookMsgConverter()) self.register_converter(EpubConverter()) self.register_converter(CsvConverter()) + self.register_converter(EmlConverter()) # Register Document Intelligence converter at the top of the stack if endpoint is provided docintel_endpoint = kwargs.get("docintel_endpoint") diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py new file mode 100644 index 000000000..b648990f4 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py @@ -0,0 +1,92 @@ +from typing import Any +from ._base import DocumentConverter, DocumentConverterResult +from email import policy +from email.parser import Parser +from email.utils import parseaddr + +class EmlConverter(DocumentConverter): + """Converts EML (email) files to Markdown. Preserves headers, body, and attachments info.""" + + def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult: + """Convert an EML file to markdown. + Args: + local_path: Path to the EML file + **kwargs: Additional arguments (unused) + Returns: + DocumentConverterResult containing the converted markdown + """ + # Check if this is an EML file + file_ext = kwargs.get("file_extension", "").lower() + if not file_ext.endswith(".eml"): + return None + + with open(local_path, "r", encoding="utf-8") as fp: + # Use policy=default to handle RFC compliant emails + msg = Parser(policy=policy.default).parse(fp) + + # Initialize result with email subject as title + result = DocumentConverterResult(title=msg.get("subject", "Untitled Email")) + + # Build markdown content + md_parts = [] + + # Add email headers + md_parts.append("## Email Headers\n") + + # From and To in a more readable format + from_name, from_email = parseaddr(msg.get("from", "")) + to_name, to_email = parseaddr(msg.get("to", "")) + + md_parts.append( + f"**From:** {from_name} <{from_email}>" + if from_name + else f"**From:** {from_email}" + ) + md_parts.append( + f"**To:** {to_name} <{to_email}>" if to_name else f"**To:** {to_email}" + ) + md_parts.append(f"**Subject:** {msg.get('subject', '')}") + md_parts.append(f"**Date:** {msg.get('date', '')}") + + # Add CC if present + if msg.get("cc"): + md_parts.append(f"**CC:** {msg.get('cc')}") + + md_parts.append("\n## Email Content\n") + + # Handle the email body + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + md_parts.append(part.get_content()) + elif part.get_content_type() == "text/html": + # If we have HTML content but no plain text, we could convert HTML to markdown here + # For now, we'll just note it's HTML content + if not any( + p.get_content_type() == "text/plain" for p in msg.walk() + ): + md_parts.append(part.get_content()) + else: + md_parts.append(msg.get_content()) + + # List attachments if any + attachments = [] + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_disposition() == "attachment": + filename = part.get_filename() + if filename: + size = len(part.get_content()) + mime_type = part.get_content_type() + attachments.append( + f"- {filename} ({mime_type}, {size:,} bytes)" + ) + + if attachments: + md_parts.append("\n## Attachments\n") + md_parts.extend(attachments) + + # Combine all parts + result.text_content = "\n".join(md_parts) + + return result \ No newline at end of file From 027fc2a4dc3f1f822647120b83b52155c35d415a Mon Sep 17 00:00:00 2001 From: Luis Lopes Date: Fri, 28 Nov 2025 19:22:24 +0100 Subject: [PATCH 2/4] add missing imports --- packages/markitdown/src/markitdown/converters/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..650ad9df2 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -17,6 +17,7 @@ from ._audio_converter import AudioConverter from ._outlook_msg_converter import OutlookMsgConverter from ._zip_converter import ZipConverter +from ._eml_converter import EmlConverter from ._doc_intel_converter import ( DocumentIntelligenceConverter, DocumentIntelligenceFileType, @@ -45,4 +46,5 @@ "DocumentIntelligenceFileType", "EpubConverter", "CsvConverter", + "EmlConverter", ] From 5e8cf881c16262c3d4bb7c3698a7cc66c9f62114 Mon Sep 17 00:00:00 2001 From: Luis Lopes Date: Fri, 28 Nov 2025 19:27:13 +0100 Subject: [PATCH 3/4] fix path for base converter --- packages/markitdown/src/markitdown/converters/_eml_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py index b648990f4..da03724e5 100644 --- a/packages/markitdown/src/markitdown/converters/_eml_converter.py +++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py @@ -1,5 +1,5 @@ from typing import Any -from ._base import DocumentConverter, DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from email import policy from email.parser import Parser from email.utils import parseaddr From c6c20ddbf349849a35817f4de0ca851da7e5bb94 Mon Sep 17 00:00:00 2001 From: Luis Lopes Date: Fri, 28 Nov 2025 19:49:45 +0100 Subject: [PATCH 4/4] Update EmlConverter to latest standard --- .../markitdown/converters/_eml_converter.py | 176 ++++++++++++------ packages/markitdown/tests/test_files/test.eml | 33 ++++ 2 files changed, 151 insertions(+), 58 deletions(-) create mode 100644 packages/markitdown/tests/test_files/test.eml diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py index da03724e5..e1dcfe2e8 100644 --- a/packages/markitdown/src/markitdown/converters/_eml_converter.py +++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py @@ -1,82 +1,140 @@ -from typing import Any -from .._base_converter import DocumentConverter, DocumentConverterResult +from typing import Any, BinaryIO, List, Tuple + from email import policy -from email.parser import Parser -from email.utils import parseaddr +from email.parser import BytesParser +from email.utils import getaddresses + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo + + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "message/", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".eml", +] + class EmlConverter(DocumentConverter): """Converts EML (email) files to Markdown. Preserves headers, body, and attachments info.""" - def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult: - """Convert an EML file to markdown. - Args: - local_path: Path to the EML file - **kwargs: Additional arguments (unused) - Returns: - DocumentConverterResult containing the converted markdown - """ - # Check if this is an EML file - file_ext = kwargs.get("file_extension", "").lower() - if not file_ext.endswith(".eml"): - return None - - with open(local_path, "r", encoding="utf-8") as fp: - # Use policy=default to handle RFC compliant emails - msg = Parser(policy=policy.default).parse(fp) - - # Initialize result with email subject as title - result = DocumentConverterResult(title=msg.get("subject", "Untitled Email")) + def accepts( + self, + file_stream: BinaryIO, # noqa: ARG002 - required by interface + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + # Check the extension and mimetype + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, # noqa: ARG002 - kept for interface compatibility + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + """Convert an EML message to markdown.""" + _ = kwargs # Currently unused + + # Read the full message from the binary stream and parse it + raw_bytes = file_stream.read() + msg = BytesParser(policy=policy.default).parsebytes(raw_bytes) # Build markdown content - md_parts = [] + md_parts: List[str] = [] # Add email headers md_parts.append("## Email Headers\n") - # From and To in a more readable format - from_name, from_email = parseaddr(msg.get("from", "")) - to_name, to_email = parseaddr(msg.get("to", "")) - - md_parts.append( - f"**From:** {from_name} <{from_email}>" - if from_name - else f"**From:** {from_email}" - ) - md_parts.append( - f"**To:** {to_name} <{to_email}>" if to_name else f"**To:** {to_email}" - ) - md_parts.append(f"**Subject:** {msg.get('subject', '')}") - md_parts.append(f"**Date:** {msg.get('date', '')}") - - # Add CC if present - if msg.get("cc"): - md_parts.append(f"**CC:** {msg.get('cc')}") + # Helper to format address headers that can contain multiple addresses + def _format_address_header(header_name: str) -> Tuple[str, str]: + raw_values = msg.get_all(header_name, []) + if not raw_values: + return header_name, "" + + addresses = getaddresses(raw_values) + formatted = [] + for name, addr in addresses: + if name and addr: + formatted.append(f"{name} <{addr}>") + elif addr: + formatted.append(addr) + return header_name, ", ".join(formatted) + + # From, To, Cc, Bcc in a readable format + for header in ["From", "To", "Cc", "Bcc"]: + key, value = _format_address_header(header) + if value: + md_parts.append(f"**{key}:** {value}") + + # Other common headers + subject = msg.get("Subject", "") + if subject: + md_parts.append(f"**Subject:** {subject}") + + date = msg.get("Date", "") + if date: + md_parts.append(f"**Date:** {date}") md_parts.append("\n## Email Content\n") - # Handle the email body + # Prefer plain text body; fall back to HTML if no plain text part exists + body_text: List[str] = [] + has_text_plain = False + if msg.is_multipart(): + # First pass: check if there is any text/plain part for part in msg.walk(): if part.get_content_type() == "text/plain": - md_parts.append(part.get_content()) - elif part.get_content_type() == "text/html": - # If we have HTML content but no plain text, we could convert HTML to markdown here - # For now, we'll just note it's HTML content - if not any( - p.get_content_type() == "text/plain" for p in msg.walk() - ): - md_parts.append(part.get_content()) + has_text_plain = True + break + + for part in msg.walk(): + content_type = part.get_content_type() + disposition = part.get_content_disposition() + + # Skip attachments when extracting the main body + if disposition == "attachment": + continue + + if content_type == "text/plain": + body_text.append(part.get_content()) + elif content_type == "text/html" and not has_text_plain: + # If we have HTML content but no plain text, fall back to HTML + body_text.append(part.get_content()) else: - md_parts.append(msg.get_content()) + # Single-part message + content_type = msg.get_content_type() + if content_type in ("text/plain", "text/html", "text/rfc822-headers"): + body_text.append(msg.get_content()) + + if body_text: + md_parts.append("\n".join(body_text)) - # List attachments if any - attachments = [] + # List attachments, if any + attachments: List[str] = [] if msg.is_multipart(): for part in msg.walk(): if part.get_content_disposition() == "attachment": filename = part.get_filename() if filename: - size = len(part.get_content()) + try: + payload = part.get_content() + size = len(payload) if isinstance(payload, (bytes, str)) else 0 + except Exception: + size = 0 mime_type = part.get_content_type() attachments.append( f"- {filename} ({mime_type}, {size:,} bytes)" @@ -86,7 +144,9 @@ def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult: md_parts.append("\n## Attachments\n") md_parts.extend(attachments) - # Combine all parts - result.text_content = "\n".join(md_parts) + markdown = "\n".join(md_parts).strip() - return result \ No newline at end of file + return DocumentConverterResult( + markdown=markdown, + title=subject or None, + ) \ No newline at end of file diff --git a/packages/markitdown/tests/test_files/test.eml b/packages/markitdown/tests/test_files/test.eml new file mode 100644 index 000000000..15f6b85cf --- /dev/null +++ b/packages/markitdown/tests/test_files/test.eml @@ -0,0 +1,33 @@ +Content-Type: multipart/mixed; boundary="===============8484938434343225034==" +MIME-Version: 1.0 +Subject: Test Email Document +From: John Doe +To: Jane Smith +Date: Wed, 18 Dec 2024 10:00:00 +0000 +CC: cc.person@example.com + +--===============8484938434343225034== +Content-Type: text/plain; charset="us-ascii" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit + + +This is a test email with multiple parts. + +It contains: +- Plain text content +- An attachment +- Various headers + +Best regards, +John Doe + +--===============8484938434343225034== +Content-Type: application/txt +MIME-Version: 1.0 +Content-Transfer-Encoding: base64 +Content-Disposition: attachment; filename="test.txt" + +VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA== + +--===============8484938434343225034==--