add support for EML

2025-01-09 11:14:50 +01:00 · 2025-01-09 11:14:50 +01:00 · 68cc8aa672
commit 68cc8aa672
parent 1deaba1c6c
3 changed files with 148 additions and 0 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -17,6 +17,9 @@ from xml.dom import minidom
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
 from email import policy
 from email.parser import Parser
 from email.utils import parseaddr
 import mammoth
 import markdownify
@ -1075,6 +1078,96 @@ class ImageConverter(MediaConverter):
        return response.choices[0].message.content
 class EmlConverter(DocumentConverter):
    """Converts EML (email) files to Markdown. Preserves headers, body, and attachments info.

    The output has an "Email Headers" section (From, To, Subject, Date and,
    when present, CC), an "Email Content" section with the message text, and
    an "Attachments" section listing each named attachment.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert an EML file to markdown.

        Args:
            local_path: Path to the EML file
            **kwargs: Only "file_extension" is consulted; it must end with
                ".eml" (case-insensitive) for this converter to apply.

        Returns:
            DocumentConverterResult containing the converted markdown, or
            None when the file extension is not ".eml".
        """
        # Function-scope import: the module only imports the text-mode Parser.
        from email.parser import BytesParser

        # `or ""` also covers an explicit file_extension=None.
        file_ext = kwargs.get("file_extension") or ""
        if not file_ext.lower().endswith(".eml"):
            return None

        # Parse from bytes rather than from UTF-8 text: a message may carry
        # 8-bit data in other charsets, and every part declares its own
        # charset, so decoding the whole file up front can fail or corrupt it.
        with open(local_path, "rb") as fp:
            msg = BytesParser(policy=policy.default).parse(fp)

        # Use the email subject as the document title
        subject = str(msg.get("subject", ""))
        result = DocumentConverterResult(title=subject or "Untitled Email")

        md_parts = [
            "## Email Headers\n",
            f"**From:** {self._format_addresses(msg.get_all('from', []))}",
            f"**To:** {self._format_addresses(msg.get_all('to', []))}",
            f"**Subject:** {subject}",
            f"**Date:** {msg.get('date', '')}",
        ]
        # CC is optional, so only emit it when present
        if msg.get("cc"):
            md_parts.append(f"**CC:** {msg.get('cc')}")

        md_parts.append("\n## Email Content\n")
        md_parts.extend(self._body_texts(msg))

        attachments = self._attachment_lines(msg) if msg.is_multipart() else []
        if attachments:
            md_parts.append("\n## Attachments\n")
            md_parts.extend(attachments)

        result.text_content = "\n".join(md_parts)
        return result

    @staticmethod
    def _format_addresses(header_values: List[Any]) -> str:
        """Render every address in the given header values as 'Name <addr>' or 'addr'."""
        from email.utils import getaddresses

        formatted = []
        # getaddresses (unlike parseaddr) keeps all recipients, not just the first
        for name, address in getaddresses([str(value) for value in header_values]):
            if name:
                formatted.append(f"{name} <{address}>")
            elif address:
                formatted.append(address)
        return ", ".join(formatted)

    @staticmethod
    def _read_text(part: Any) -> str:
        """Return the decoded text of a text/* part."""
        try:
            return part.get_content()
        except LookupError:
            # Unknown or misspelled charset: fall back to a lossy UTF-8 decode
            # instead of failing the whole conversion.
            payload = part.get_payload(decode=True) or b""
            return payload.decode("utf-8", errors="replace")

    @staticmethod
    def _body_texts(msg: Any) -> List[str]:
        """Collect the body text: all text/plain parts, or the text/html parts if there are none."""
        if not msg.is_multipart():
            # A non-text single-part body would yield bytes, which cannot be
            # joined into the markdown output, so it is left out.
            if msg.get_content_maintype() == "text":
                return [EmlConverter._read_text(msg)]
            return []

        plain_texts: List[str] = []
        html_texts: List[str] = []
        for part in msg.walk():
            # Containers hold no text themselves; attachments are listed in
            # their own section and must not leak into the body.
            if part.is_multipart() or part.get_content_disposition() == "attachment":
                continue
            content_type = part.get_content_type()
            if content_type == "text/plain":
                plain_texts.append(EmlConverter._read_text(part))
            elif content_type == "text/html":
                html_texts.append(EmlConverter._read_text(part))
        # HTML is only a fallback for messages without a plain-text body
        return plain_texts or html_texts

    @staticmethod
    def _attachment_lines(msg: Any) -> List[str]:
        """Build one markdown bullet per named attachment, with MIME type and size."""
        lines = []
        for part in msg.walk():
            if part.get_content_disposition() != "attachment":
                continue
            filename = part.get_filename()
            if not filename:
                continue
            mime_type = part.get_content_type()
            # Decoded payload gives the real byte size; it is None for
            # container parts (e.g. an attached message), which have no
            # single payload to measure.
            payload = part.get_payload(decode=True)
            if payload is None:
                lines.append(f"- {filename} ({mime_type})")
            else:
                lines.append(f"- {filename} ({mime_type}, {len(payload):,} bytes)")
        return lines
 class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.
@ -1273,6 +1366,7 @@ class MarkItDown:
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
        self.register_page_converter(EmlConverter())
    def convert(
        self, source: Union[str, requests.Response], **kwargs: Any
--- a/tests/test_files/test.eml
+++ b/tests/test_files/test.eml
@ -0,0 +1,33 @@
 Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
 MIME-Version: 1.0
 Subject: Test Email Document
 From: John Doe <john.doe@example.com>
 To: Jane Smith <jane.smith@example.com>
 Date: Wed, 18 Dec 2024 10:00:00 +0000
 CC: cc.person@example.com
 --===============8484938434343225034==
 Content-Type: text/plain; charset="us-ascii"
 MIME-Version: 1.0
 Content-Transfer-Encoding: 7bit
 This is a test email with multiple parts.
 It contains:
 - Plain text content
 - An attachment
 - Various headers
 Best regards,
 John Doe
 --===============8484938434343225034==
 Content-Type: application/txt
 MIME-Version: 1.0
 Content-Transfer-Encoding: base64
 Content-Disposition: attachment; filename="test.txt"
 VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
 --===============8484938434343225034==--
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -126,6 +126,20 @@ CSV_CP932_TEST_STRINGS = [
    "髙橋淳,35,名古屋",
 ]
 # Substrings that must all appear in the Markdown produced by converting
 # tests/test_files/test.eml (backslashes are stripped from the output before
 # the comparison). They cover the headers section, the plain-text body, and
 # the attachment listing.
 EML_TEST_STRINGS = [
    "## Email Headers",
    "**From:** John Doe <john.doe@example.com>",
    "**To:** Jane Smith <jane.smith@example.com>",
    "**Subject:** Test Email Document",
    "**CC:** cc.person@example.com",
    "## Email Content",
    "This is a test email with multiple parts",
    "- Plain text content",
    "- An attachment",
    "## Attachments",
    "- test.txt (application/txt, 31 bytes)",
 ]
 # NOTE(review): presumably a marker expected in LLM-generated output; its
 # use is not visible in this hunk — confirm against the LLM test.
 LLM_TEST_STRINGS = [
    "5bda1dd6",
 ]
@ -197,6 +211,13 @@ def test_markitdown_local() -> None:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content
    # Test EML processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.eml"))
    assert result.title == "Test Email Document"
    for test_string in EML_TEST_STRINGS:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content
    # Test HTML processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL