This commit is contained in:
Raduan A. 2025-02-10 22:17:07 +01:00 committed by GitHub
commit 00717c4fa6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 148 additions and 0 deletions

View file

@ -18,6 +18,9 @@ from typing import Any, Dict, List, Optional, Union
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
from email import policy
from email.parser import BytesParser, Parser
from email.utils import getaddresses, parseaddr

import mammoth
import markdownify
@ -1258,6 +1261,96 @@ class OutlookMsgConverter(DocumentConverter):
return None
class EmlConverter(DocumentConverter):
    """Converts EML (email) files to Markdown. Preserves headers, body, and attachments info."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert an EML file to markdown.

        Args:
            local_path: Path to the EML file.
            **kwargs: Additional arguments. Only ``file_extension`` is read;
                the conversion is attempted only when it ends with ".eml".

        Returns:
            DocumentConverterResult whose title is the email subject and whose
            text_content is the Markdown rendering, or None when the file
            extension does not identify an EML file.
        """
        # Tolerate an explicit file_extension=None as well as a missing key.
        file_ext = (kwargs.get("file_extension") or "").lower()
        if not file_ext.endswith(".eml"):
            return None

        # Parse from bytes: an EML file is not guaranteed to be valid UTF-8
        # (8bit bodies, legacy charsets), and each MIME part declares its own
        # charset, which the parser honours when decoding.
        with open(local_path, "rb") as fp:
            msg = BytesParser(policy=policy.default).parse(fp)

        # Initialize result with email subject as title
        result = DocumentConverterResult(
            title=str(msg.get("subject", "Untitled Email"))
        )

        md_parts = ["## Email Headers\n"]
        md_parts.append(f"**From:** {self._format_addresses(msg.get('from', ''))}")
        md_parts.append(f"**To:** {self._format_addresses(msg.get('to', ''))}")
        md_parts.append(f"**Subject:** {msg.get('subject', '')}")
        md_parts.append(f"**Date:** {msg.get('date', '')}")

        # CC is optional; emit the line only when the header is present.
        cc = msg.get("cc")
        if cc:
            md_parts.append(f"**CC:** {cc}")

        md_parts.append("\n## Email Content\n")
        md_parts.extend(self._collect_body(msg))

        attachments = self._list_attachments(msg)
        if attachments:
            md_parts.append("\n## Attachments\n")
            md_parts.extend(attachments)

        result.text_content = "\n".join(md_parts)
        return result

    @staticmethod
    def _format_addresses(header_value: Any) -> str:
        """Render an address header as ``Name <addr>`` entries joined by ", ".

        Every address in the header is kept (not just the first one); entries
        without a display name are rendered as the bare address.
        """
        pairs = getaddresses([str(header_value)])
        return ", ".join(
            f"{name} <{addr}>" if name else addr
            for name, addr in pairs
            if name or addr
        )

    @staticmethod
    def _part_text(part: Any) -> Union[None, str]:
        """Return the decoded text of a MIME part, or None if it is not text."""
        try:
            content = part.get_content()
        except LookupError:
            # Raised for an unknown declared charset (or a part type without
            # a content handler). Fall back to a lossy UTF-8 decode for text.
            if part.get_content_maintype() != "text":
                return None
            raw = part.get_payload(decode=True)
            if not isinstance(raw, bytes):
                return None
            return raw.decode("utf-8", errors="replace")
        return content if isinstance(content, str) else None

    @staticmethod
    def _collect_body(msg: Any) -> List[str]:
        """Collect the body text of the message.

        For multipart messages all inline text/plain parts are returned; the
        text/html parts are used only when no plain-text part exists. Parts
        marked as attachments are never treated as body text.
        """
        if not msg.is_multipart():
            text = EmlConverter._part_text(msg)
            return [] if text is None else [text]

        plain: List[str] = []
        html: List[str] = []
        # Single pass over the MIME tree; the plain-vs-HTML decision is made
        # once at the end instead of re-walking the tree per HTML part.
        for part in msg.walk():
            if part.get_content_disposition() == "attachment":
                continue
            content_type = part.get_content_type()
            if content_type == "text/plain":
                bucket = plain
            elif content_type == "text/html":
                bucket = html
            else:
                continue
            text = EmlConverter._part_text(part)
            if text is not None:
                bucket.append(text)
        return plain or html

    @staticmethod
    def _list_attachments(msg: Any) -> List[str]:
        """Return one Markdown bullet per named attachment of a multipart message."""
        entries: List[str] = []
        if not msg.is_multipart():
            return entries

        for part in msg.walk():
            if part.get_content_disposition() != "attachment":
                continue
            filename = part.get_filename()
            if not filename:
                continue
            # Measure the transfer-decoded payload so the size is in bytes
            # for every content type (text attachments included).
            payload = part.get_payload(decode=True)
            if payload is not None:
                size = len(payload)
            else:
                # Container attachment (e.g. an attached message): report the
                # serialized size of what it contains.
                size = sum(len(sub.as_bytes()) for sub in part.get_payload())
            entries.append(
                f"- {filename} ({part.get_content_type()}, {size:,} bytes)"
            )
        return entries
class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.
@ -1543,6 +1636,7 @@ class MarkItDown:
self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter())
self.register_page_converter(OutlookMsgConverter())
self.register_page_converter(EmlConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None:

33
tests/test_files/test.eml vendored Normal file
View file

@ -0,0 +1,33 @@
Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
MIME-Version: 1.0
Subject: Test Email Document
From: John Doe <john.doe@example.com>
To: Jane Smith <jane.smith@example.com>
Date: Wed, 18 Dec 2024 10:00:00 +0000
CC: cc.person@example.com
--===============8484938434343225034==
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
This is a test email with multiple parts.
It contains:
- Plain text content
- An attachment
- Various headers
Best regards,
John Doe
--===============8484938434343225034==
Content-Type: application/txt
MIME-Version: 1.0
Content-Transfer-Encoding: base64
Content-Disposition: attachment; filename="test.txt"
VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
--===============8484938434343225034==--

View file

@ -141,6 +141,20 @@ CSV_CP932_TEST_STRINGS = [
"髙橋淳,35,名古屋",
]
# Substrings that must each appear in the Markdown produced from
# tests/test_files/test.eml (checked after backslashes are stripped from the
# output): the section headings, the formatted header lines, fragments of the
# plain-text body, and the attachment listing. The fixture's base64 attachment
# decodes to 31 bytes, which is the size asserted in the last entry.
EML_TEST_STRINGS = [
"## Email Headers",
"**From:** John Doe <john.doe@example.com>",
"**To:** Jane Smith <jane.smith@example.com>",
"**Subject:** Test Email Document",
"**CC:** cc.person@example.com",
"## Email Content",
"This is a test email with multiple parts",
"- Plain text content",
"- An attachment",
"## Attachments",
"- test.txt (application/txt, 31 bytes)",
]
# Expected substring(s) for the LLM-related test. The test that consumes this
# list is not visible in this excerpt -- TODO confirm what "5bda1dd6" identifies.
LLM_TEST_STRINGS = [
"5bda1dd6",
]
@ -224,6 +238,13 @@ def test_markitdown_local() -> None:
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
validate_strings(result, PPTX_TEST_STRINGS)
# Test EML processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.eml"))
assert result.title == "Test Email Document"
for test_string in EML_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test HTML processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL