add support for EML
This commit is contained in:
parent
1deaba1c6c
commit
68cc8aa672
3 changed files with 148 additions and 0 deletions
|
|
@ -17,6 +17,9 @@ from xml.dom import minidom
|
|||
from typing import Any, Dict, List, Optional, Union
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from warnings import warn, resetwarnings, catch_warnings
|
||||
from email import policy
|
||||
from email.parser import Parser
|
||||
from email.utils import parseaddr
|
||||
|
||||
import mammoth
|
||||
import markdownify
|
||||
|
|
@ -1075,6 +1078,96 @@ class ImageConverter(MediaConverter):
|
|||
return response.choices[0].message.content
|
||||
|
||||
|
||||
class EmlConverter(DocumentConverter):
    """Converts EML (email) files to Markdown.

    The output contains the main headers (From, To, Subject, Date and, when
    present, CC), the message body, and one summary line per named attachment.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert an EML file to markdown.

        Args:
            local_path: Path to the EML file
            **kwargs: Additional arguments; only ``file_extension`` is read

        Returns:
            DocumentConverterResult containing the converted markdown, or
            None when ``file_extension`` does not end with ".eml"
        """
        # Imported at function scope so this block does not depend on a
        # change to the module-level imports.
        from email.parser import BytesParser

        # Check if this is an EML file (tolerate an explicit None extension)
        file_ext = (kwargs.get("file_extension") or "").lower()
        if not file_ext.endswith(".eml"):
            return None

        # Parse from bytes: an EML file is not guaranteed to be UTF-8, and the
        # bytes parser decodes every part using the charset that part declares.
        with open(local_path, "rb") as fp:
            # policy.default yields EmailMessage objects (get_content, etc.)
            msg = BytesParser(policy=policy.default).parse(fp)

        # Initialize result with email subject as title
        result = DocumentConverterResult(title=msg.get("subject", "Untitled Email"))

        # Build markdown content
        md_parts = []

        # Add email headers
        md_parts.append("## Email Headers\n")
        md_parts.append(f"**From:** {self._format_addresses(msg.get('from', ''))}")
        md_parts.append(f"**To:** {self._format_addresses(msg.get('to', ''))}")
        md_parts.append(f"**Subject:** {msg.get('subject', '')}")
        md_parts.append(f"**Date:** {msg.get('date', '')}")

        # Add CC if present
        if msg.get("cc"):
            md_parts.append(f"**CC:** {msg.get('cc')}")

        md_parts.append("\n## Email Content\n")

        # Walk the MIME tree once and reuse it for the body and attachments
        parts = list(msg.walk())

        # Handle the email body
        if msg.is_multipart():
            # Attachments are listed separately, so they never count as body
            body_parts = [
                part
                for part in parts
                if part.get_content_disposition() != "attachment"
                and part.get_content_type() in ("text/plain", "text/html")
            ]
            has_plain_text = any(
                part.get_content_type() == "text/plain" for part in body_parts
            )
            for part in body_parts:
                # Prefer plain text; the raw HTML is emitted (unconverted)
                # only when the message has no plain-text body at all.
                if part.get_content_type() == "text/plain" or not has_plain_text:
                    md_parts.append(part.get_content())
        elif msg.get_content_maintype() == "text":
            # Non-text single-part messages would return bytes here, which
            # cannot be joined into the markdown output.
            md_parts.append(msg.get_content())

        # List attachments if any
        attachments = []
        for part in parts:
            if part.get_content_disposition() != "attachment":
                continue
            filename = part.get_filename()
            if not filename:
                continue
            mime_type = part.get_content_type()
            # Decoded payload gives the real byte size; it is None for
            # container parts (e.g. an attached message), so omit the size.
            payload = part.get_payload(decode=True)
            if payload is None:
                attachments.append(f"- {filename} ({mime_type})")
            else:
                attachments.append(
                    f"- {filename} ({mime_type}, {len(payload):,} bytes)"
                )

        if attachments:
            md_parts.append("\n## Attachments\n")
            md_parts.extend(attachments)

        # Combine all parts
        result.text_content = "\n".join(md_parts)

        return result

    @staticmethod
    def _format_addresses(header_value: Any) -> str:
        """Format an address header as ``Name <addr>`` entries joined by ", ".

        Every address in the header is kept; an entry without a display name
        is rendered as the bare address. An empty header yields "".
        """
        from email.utils import getaddresses

        formatted = []
        for name, address in getaddresses([str(header_value)]):
            if name:
                formatted.append(f"{name} <{address}>")
            elif address:
                formatted.append(address)
        return ", ".join(formatted)
|
||||
|
||||
|
||||
class ZipConverter(DocumentConverter):
|
||||
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||
|
||||
|
|
@ -1273,6 +1366,7 @@ class MarkItDown:
|
|||
self.register_page_converter(IpynbConverter())
|
||||
self.register_page_converter(PdfConverter())
|
||||
self.register_page_converter(ZipConverter())
|
||||
self.register_page_converter(EmlConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response], **kwargs: Any
|
||||
|
|
|
|||
33
tests/test_files/test.eml
vendored
Normal file
33
tests/test_files/test.eml
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
|
||||
MIME-Version: 1.0
|
||||
Subject: Test Email Document
|
||||
From: John Doe <john.doe@example.com>
|
||||
To: Jane Smith <jane.smith@example.com>
|
||||
Date: Wed, 18 Dec 2024 10:00:00 +0000
|
||||
CC: cc.person@example.com
|
||||
|
||||
--===============8484938434343225034==
|
||||
Content-Type: text/plain; charset="us-ascii"
|
||||
MIME-Version: 1.0
|
||||
Content-Transfer-Encoding: 7bit
|
||||
|
||||
|
||||
This is a test email with multiple parts.
|
||||
|
||||
It contains:
|
||||
- Plain text content
|
||||
- An attachment
|
||||
- Various headers
|
||||
|
||||
Best regards,
|
||||
John Doe
|
||||
|
||||
--===============8484938434343225034==
|
||||
Content-Type: application/txt
|
||||
MIME-Version: 1.0
|
||||
Content-Transfer-Encoding: base64
|
||||
Content-Disposition: attachment; filename="test.txt"
|
||||
|
||||
VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
|
||||
|
||||
--===============8484938434343225034==--
|
||||
|
|
@ -126,6 +126,20 @@ CSV_CP932_TEST_STRINGS = [
|
|||
"髙橋淳,35,名古屋",
|
||||
]
|
||||
|
||||
EML_TEST_STRINGS = [
|
||||
"## Email Headers",
|
||||
"**From:** John Doe <john.doe@example.com>",
|
||||
"**To:** Jane Smith <jane.smith@example.com>",
|
||||
"**Subject:** Test Email Document",
|
||||
"**CC:** cc.person@example.com",
|
||||
"## Email Content",
|
||||
"This is a test email with multiple parts",
|
||||
"- Plain text content",
|
||||
"- An attachment",
|
||||
"## Attachments",
|
||||
"- test.txt (application/txt, 31 bytes)",
|
||||
]
|
||||
|
||||
LLM_TEST_STRINGS = [
|
||||
"5bda1dd6",
|
||||
]
|
||||
|
|
@ -197,6 +211,13 @@ def test_markitdown_local() -> None:
|
|||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
|
||||
# Test EML processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.eml"))
|
||||
assert result.title == "Test Email Document"
|
||||
for test_string in EML_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
|
||||
# Test HTML processing
|
||||
result = markitdown.convert(
|
||||
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
|
||||
|
|
|
|||
Loading…
Reference in a new issue