From ea9af600869b0f0a2b3336ba67be9640a59422a7 Mon Sep 17 00:00:00 2001
From: Luis Lopes <git@luislopes.org>
Date: Fri, 28 Nov 2025 18:52:08 +0100
Subject: [PATCH 1/4] add eml converter

---
 .../markitdown/src/markitdown/_markitdown.py  |  2 +
 .../markitdown/converters/_eml_converter.py   | 92 +++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 packages/markitdown/src/markitdown/converters/_eml_converter.py

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index 702b10c68..55798347c 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -39,6 +39,7 @@
     EpubConverter,
     DocumentIntelligenceConverter,
     CsvConverter,
+    EmlConverter,
 )
 
 from ._base_converter import DocumentConverter, DocumentConverterResult
@@ -195,6 +196,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(OutlookMsgConverter())
             self.register_converter(EpubConverter())
             self.register_converter(CsvConverter())
+            self.register_converter(EmlConverter())
 
             # Register Document Intelligence converter at the top of the stack if endpoint is provided
             docintel_endpoint = kwargs.get("docintel_endpoint")
diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py
new file mode 100644
index 000000000..b648990f4
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py
@@ -0,0 +1,92 @@
+from typing import Any
+from ._base import DocumentConverter, DocumentConverterResult
+from email import policy
+from email.parser import Parser
+from email.utils import parseaddr
+
+class EmlConverter(DocumentConverter):
+    """Converts EML (email) files to Markdown. Preserves headers, body, and attachments info."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
+        """Convert an EML file to markdown.
+        Args:
+            local_path: Path to the EML file
+            **kwargs: Additional arguments (unused)
+        Returns:
+            DocumentConverterResult containing the converted markdown
+        """
+        # Check if this is an EML file
+        file_ext = kwargs.get("file_extension", "").lower()
+        if not file_ext.endswith(".eml"):
+            return None
+
+        with open(local_path, "r", encoding="utf-8") as fp:
+            # Use policy=default to handle RFC compliant emails
+            msg = Parser(policy=policy.default).parse(fp)
+
+        # Initialize result with email subject as title
+        result = DocumentConverterResult(title=msg.get("subject", "Untitled Email"))
+
+        # Build markdown content
+        md_parts = []
+
+        # Add email headers
+        md_parts.append("## Email Headers\n")
+
+        # From and To in a more readable format
+        from_name, from_email = parseaddr(msg.get("from", ""))
+        to_name, to_email = parseaddr(msg.get("to", ""))
+
+        md_parts.append(
+            f"**From:** {from_name} <{from_email}>"
+            if from_name
+            else f"**From:** {from_email}"
+        )
+        md_parts.append(
+            f"**To:** {to_name} <{to_email}>" if to_name else f"**To:** {to_email}"
+        )
+        md_parts.append(f"**Subject:** {msg.get('subject', '')}")
+        md_parts.append(f"**Date:** {msg.get('date', '')}")
+
+        # Add CC if present
+        if msg.get("cc"):
+            md_parts.append(f"**CC:** {msg.get('cc')}")
+
+        md_parts.append("\n## Email Content\n")
+
+        # Handle the email body
+        if msg.is_multipart():
+            for part in msg.walk():
+                if part.get_content_type() == "text/plain":
+                    md_parts.append(part.get_content())
+                elif part.get_content_type() == "text/html":
+                    # If we have HTML content but no plain text, we could convert HTML to markdown here
+                    # For now, we'll just note it's HTML content
+                    if not any(
+                        p.get_content_type() == "text/plain" for p in msg.walk()
+                    ):
+                        md_parts.append(part.get_content())
+        else:
+            md_parts.append(msg.get_content())
+
+        # List attachments if any
+        attachments = []
+        if msg.is_multipart():
+            for part in msg.walk():
+                if part.get_content_disposition() == "attachment":
+                    filename = part.get_filename()
+                    if filename:
+                        size = len(part.get_content())
+                        mime_type = part.get_content_type()
+                        attachments.append(
+                            f"- {filename} ({mime_type}, {size:,} bytes)"
+                        )
+
+        if attachments:
+            md_parts.append("\n## Attachments\n")
+            md_parts.extend(attachments)
+
+        # Combine all parts
+        result.text_content = "\n".join(md_parts)
+
+        return result
\ No newline at end of file

From 027fc2a4dc3f1f822647120b83b52155c35d415a Mon Sep 17 00:00:00 2001
From: Luis Lopes <git@luislopes.org>
Date: Fri, 28 Nov 2025 19:22:24 +0100
Subject: [PATCH 2/4] add missing imports

---
 packages/markitdown/src/markitdown/converters/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index e4437a582..650ad9df2 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -17,6 +17,7 @@
 from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
+from ._eml_converter import EmlConverter
 from ._doc_intel_converter import (
     DocumentIntelligenceConverter,
     DocumentIntelligenceFileType,
@@ -45,4 +46,5 @@
     "DocumentIntelligenceFileType",
     "EpubConverter",
     "CsvConverter",
+    "EmlConverter",
 ]

From 5e8cf881c16262c3d4bb7c3698a7cc66c9f62114 Mon Sep 17 00:00:00 2001
From: Luis Lopes <git@luislopes.org>
Date: Fri, 28 Nov 2025 19:27:13 +0100
Subject: [PATCH 3/4] fix path for base converter

---
 packages/markitdown/src/markitdown/converters/_eml_converter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py
index b648990f4..da03724e5 100644
--- a/packages/markitdown/src/markitdown/converters/_eml_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py
@@ -1,5 +1,5 @@
 from typing import Any
-from ._base import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from email import policy
 from email.parser import Parser
 from email.utils import parseaddr

From c6c20ddbf349849a35817f4de0ca851da7e5bb94 Mon Sep 17 00:00:00 2001
From: Luis Lopes <git@luislopes.org>
Date: Fri, 28 Nov 2025 19:49:45 +0100
Subject: [PATCH 4/4] Port EmlConverter to the stream-based accepts()/convert() interface

---
 .../markitdown/converters/_eml_converter.py   | 176 ++++++++++++------
 packages/markitdown/tests/test_files/test.eml |  33 ++++
 2 files changed, 151 insertions(+), 58 deletions(-)
 create mode 100644 packages/markitdown/tests/test_files/test.eml

diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py
index da03724e5..e1dcfe2e8 100644
--- a/packages/markitdown/src/markitdown/converters/_eml_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py
@@ -1,82 +1,140 @@
-from typing import Any
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from typing import Any, BinaryIO, List, Tuple
+
 from email import policy
-from email.parser import Parser
-from email.utils import parseaddr
+from email.parser import BytesParser
+from email.utils import getaddresses
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "message/",  # matched as a prefix of the lower-cased stream MIME type
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".eml",  # compared for equality with the lower-cased stream extension
+]
+
 
 class EmlConverter(DocumentConverter):
     """Converts EML (email) files to Markdown. Preserves headers, body, and attachments info."""
 
-    def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
-        """Convert an EML file to markdown.
-        Args:
-            local_path: Path to the EML file
-            **kwargs: Additional arguments (unused)
-        Returns:
-            DocumentConverterResult containing the converted markdown
-        """
-        # Check if this is an EML file
-        file_ext = kwargs.get("file_extension", "").lower()
-        if not file_ext.endswith(".eml"):
-            return None
-
-        with open(local_path, "r", encoding="utf-8") as fp:
-            # Use policy=default to handle RFC compliant emails
-            msg = Parser(policy=policy.default).parse(fp)
-
-        # Initialize result with email subject as title
-        result = DocumentConverterResult(title=msg.get("subject", "Untitled Email"))
+    def accepts(
+        self,
+        file_stream: BinaryIO,  # noqa: ARG002 - required by interface
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Report whether the stream should be handled as an email message.
+
+        True when the lower-cased extension is in ACCEPTED_FILE_EXTENSIONS or
+        the lower-cased MIME type starts with an ACCEPTED_MIME_TYPE_PREFIXES entry.
+        """
+        extension = (stream_info.extension or "").lower()
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        # str.startswith takes a tuple, so one call covers every prefix
+        mimetype = (stream_info.mimetype or "").lower()
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,  # noqa: ARG002 - kept for interface compatibility
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """Convert an EML message to Markdown: headers, body, attachment list.
+
+        The body prefers text/plain parts; text/html is used only when no
+        non-attachment text/plain part exists. ``kwargs`` is accepted but unused.
+        """
+        msg = BytesParser(policy=policy.default).parsebytes(file_stream.read())
 
         # Build markdown content
-        md_parts = []
+        md_parts: List[str] = []
 
         # Add email headers
         md_parts.append("## Email Headers\n")
 
-        # From and To in a more readable format
-        from_name, from_email = parseaddr(msg.get("from", ""))
-        to_name, to_email = parseaddr(msg.get("to", ""))
-
-        md_parts.append(
-            f"**From:** {from_name} <{from_email}>"
-            if from_name
-            else f"**From:** {from_email}"
-        )
-        md_parts.append(
-            f"**To:** {to_name} <{to_email}>" if to_name else f"**To:** {to_email}"
-        )
-        md_parts.append(f"**Subject:** {msg.get('subject', '')}")
-        md_parts.append(f"**Date:** {msg.get('date', '')}")
-
-        # Add CC if present
-        if msg.get("cc"):
-            md_parts.append(f"**CC:** {msg.get('cc')}")
+        # Helper to format address headers that can contain multiple addresses
+        def _format_address_header(header_name: str) -> Tuple[str, str]:
+            raw_values = msg.get_all(header_name, [])
+            if not raw_values:
+                return header_name, ""
+
+            addresses = getaddresses(raw_values)
+            formatted = []
+            for name, addr in addresses:
+                if name and addr:
+                    formatted.append(f"{name} <{addr}>")
+                elif addr:
+                    formatted.append(addr)
+            return header_name, ", ".join(formatted)
+
+        # From, To, Cc, Bcc in a readable format
+        for header in ["From", "To", "Cc", "Bcc"]:
+            key, value = _format_address_header(header)
+            if value:
+                md_parts.append(f"**{key}:** {value}")
+
+        # Other common headers
+        subject = msg.get("Subject", "")
+        if subject:
+            md_parts.append(f"**Subject:** {subject}")
+
+        date = msg.get("Date", "")
+        if date:
+            md_parts.append(f"**Date:** {date}")
 
         md_parts.append("\n## Email Content\n")
 
-        # Handle the email body
+        # Prefer plain text body; fall back to HTML if no plain text part exists
+        body_text: List[str] = []
+        has_text_plain = False
+
         if msg.is_multipart():
+            # First pass: is there a text/plain part that belongs to the body?
             for part in msg.walk():
                 if part.get_content_type() == "text/plain":
-                    md_parts.append(part.get_content())
-                elif part.get_content_type() == "text/html":
-                    # If we have HTML content but no plain text, we could convert HTML to markdown here
-                    # For now, we'll just note it's HTML content
-                    if not any(
-                        p.get_content_type() == "text/plain" for p in msg.walk()
-                    ):
-                        md_parts.append(part.get_content())
+                    # A text/plain attachment must not suppress the HTML fallback
+                    if part.get_content_disposition() != "attachment":
+                        has_text_plain = True
+                        break
+
+            for part in msg.walk():
+                # Attachments are listed separately, never inlined in the body
+                if part.get_content_disposition() == "attachment":
+                    continue
+
+                content_type = part.get_content_type()
+                if content_type == "text/plain":
+                    body_text.append(part.get_content())
+                elif content_type == "text/html" and not has_text_plain:
+                    # If we have HTML content but no plain text, fall back to HTML
+                    body_text.append(part.get_content())
         else:
-            md_parts.append(msg.get_content())
+            # Single-part message
+            content_type = msg.get_content_type()
+            if content_type in ("text/plain", "text/html", "text/rfc822-headers"):
+                body_text.append(msg.get_content())
+
+        if body_text:
+            md_parts.append("\n".join(body_text))
 
-        # List attachments if any
-        attachments = []
+        # List attachments, if any
+        attachments: List[str] = []
         if msg.is_multipart():
             for part in msg.walk():
                 if part.get_content_disposition() == "attachment":
                     filename = part.get_filename()
                     if filename:
-                        size = len(part.get_content())
+                        # Measure the transfer-decoded payload: get_content()
+                        # returns str for text parts, so len() would count
+                        # characters rather than bytes. Multipart parts give None.
+                        raw = part.get_payload(decode=True)
+                        size = len(raw) if isinstance(raw, bytes) else 0
                         mime_type = part.get_content_type()
                         attachments.append(
                             f"- {filename} ({mime_type}, {size:,} bytes)"
@@ -86,7 +144,9 @@ def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
             md_parts.append("\n## Attachments\n")
             md_parts.extend(attachments)
 
-        # Combine all parts
-        result.text_content = "\n".join(md_parts)
+        markdown = "\n".join(md_parts).strip()
 
-        return result
\ No newline at end of file
+        return DocumentConverterResult(
+            markdown=markdown,
+            title=subject or None,
+        )
\ No newline at end of file
diff --git a/packages/markitdown/tests/test_files/test.eml b/packages/markitdown/tests/test_files/test.eml
new file mode 100644
index 000000000..15f6b85cf
--- /dev/null
+++ b/packages/markitdown/tests/test_files/test.eml
@@ -0,0 +1,33 @@
+Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
+MIME-Version: 1.0
+Subject: Test Email Document
+From: John Doe <john.doe@example.com>
+To: Jane Smith <jane.smith@example.com>
+Date: Wed, 18 Dec 2024 10:00:00 +0000
+CC: cc.person@example.com
+
+--===============8484938434343225034==
+Content-Type: text/plain; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+
+
+This is a test email with multiple parts.
+
+It contains:
+- Plain text content
+- An attachment
+- Various headers
+
+Best regards,
+John Doe
+
+--===============8484938434343225034==
+Content-Type: application/txt
+MIME-Version: 1.0
+Content-Transfer-Encoding: base64
+Content-Disposition: attachment; filename="test.txt"
+
+VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
+
+--===============8484938434343225034==--