add support for EML
This commit is contained in:
parent
1deaba1c6c
commit
68cc8aa672
3 changed files with 148 additions and 0 deletions
|
|
@ -17,6 +17,9 @@ from xml.dom import minidom
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
from warnings import warn, resetwarnings, catch_warnings
|
from warnings import warn, resetwarnings, catch_warnings
|
||||||
|
from email import policy
|
||||||
|
from email.parser import Parser
|
||||||
|
from email.utils import parseaddr
|
||||||
|
|
||||||
import mammoth
|
import mammoth
|
||||||
import markdownify
|
import markdownify
|
||||||
|
|
@ -1075,6 +1078,96 @@ class ImageConverter(MediaConverter):
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|
||||||
|
|
||||||
|
class EmlConverter(DocumentConverter):
    """Converts EML (email) files to Markdown.

    The output consists of an "Email Headers" section (From, To, Subject,
    Date and, when present, CC), an "Email Content" section with the message
    text, and an "Attachments" section listing every named attachment.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Optional[DocumentConverterResult]:
        """Convert an EML file to markdown.

        Args:
            local_path: Path to the EML file.
            **kwargs: Only ``file_extension`` is read; it must end with
                ``.eml`` (case-insensitive) for this converter to apply.

        Returns:
            A DocumentConverterResult whose title is the email subject and
            whose text content is the generated markdown, or None when the
            file extension is not ``.eml``.
        """
        # Function-scope import: only this converter needs the bytes parser.
        from email.parser import BytesParser

        # Tolerate a missing *or* explicitly-None file_extension.
        file_ext = (kwargs.get("file_extension") or "").lower()
        if not file_ext.endswith(".eml"):
            return None

        # Parse from bytes: an email is not guaranteed to be UTF-8, and each
        # part declares its own charset / transfer encoding. Decoding the
        # whole file as UTF-8 up front would fail on 8-bit legacy charsets.
        with open(local_path, "rb") as fp:
            msg = BytesParser(policy=policy.default).parse(fp)

        # The email subject doubles as the document title.
        result = DocumentConverterResult(title=msg.get("subject", "Untitled Email"))

        md_parts = ["## Email Headers\n"]
        md_parts.append(f"**From:** {self._format_addresses(msg.get('from'))}")
        md_parts.append(f"**To:** {self._format_addresses(msg.get('to'))}")
        md_parts.append(f"**Subject:** {msg.get('subject', '')}")
        md_parts.append(f"**Date:** {msg.get('date', '')}")

        # CC is optional and only emitted when present.
        cc = msg.get("cc")
        if cc:
            md_parts.append(f"**CC:** {cc}")

        md_parts.append("\n## Email Content\n")
        md_parts.extend(self._body_texts(msg))

        attachments = self._attachment_lines(msg)
        if attachments:
            md_parts.append("\n## Attachments\n")
            md_parts.extend(attachments)

        result.text_content = "\n".join(md_parts)
        return result

    @staticmethod
    def _format_addresses(header: Any) -> str:
        """Render an address header as ``Name <addr>`` entries joined by ", "."""
        if header is None:
            return ""

        # Headers parsed with policy.default expose every parsed address, so
        # multi-recipient headers are not truncated to the first entry.
        addresses = getattr(header, "addresses", None)
        if not addresses:
            name, addr = parseaddr(str(header))
            return f"{name} <{addr}>" if name else addr

        formatted = []
        for address in addresses:
            spec = address.addr_spec
            if spec == "<>":  # placeholder the stdlib uses for an empty address
                spec = ""
            if address.display_name:
                formatted.append(f"{address.display_name} <{spec}>")
            else:
                formatted.append(spec)
        return ", ".join(formatted)

    @classmethod
    def _iter_inline_leaves(cls, part: Any) -> Any:
        """Yield non-container parts in document order, skipping attachments."""
        # An attachment (and anything nested inside it, such as a forwarded
        # message) belongs in the attachment list, not in the body text.
        if part.get_content_disposition() == "attachment":
            return
        if part.is_multipart():
            for sub_part in part.get_payload():
                yield from cls._iter_inline_leaves(sub_part)
        else:
            yield part

    @staticmethod
    def _part_text(part: Any) -> str:
        """Return the content of a single part as text, never raising on charset issues."""
        try:
            content = part.get_content()
        except (LookupError, UnicodeError):
            # Unknown charset or unsupported content type: use the raw payload.
            content = part.get_payload(decode=True)

        if content is None:
            return ""
        if isinstance(content, bytes):
            return content.decode("utf-8", errors="replace")
        return content if isinstance(content, str) else str(content)

    @classmethod
    def _body_texts(cls, msg: Any) -> List[str]:
        """Collect the body text: all inline text/plain parts, else inline text/html."""
        if not msg.is_multipart():
            return [cls._part_text(msg)]

        leaves = list(cls._iter_inline_leaves(msg))
        chosen = [p for p in leaves if p.get_content_type() == "text/plain"]
        if not chosen:
            # No plain-text alternative: fall back to the HTML parts, which
            # are passed through as-is (no HTML-to-markdown conversion).
            chosen = [p for p in leaves if p.get_content_type() == "text/html"]
        return [cls._part_text(p) for p in chosen]

    @staticmethod
    def _attachment_lines(msg: Any) -> List[str]:
        """Build one markdown bullet per named attachment of a multipart message."""
        lines: List[str] = []
        if not msg.is_multipart():
            return lines

        for part in msg.walk():
            if part.get_content_disposition() != "attachment":
                continue
            filename = part.get_filename()
            if not filename:
                continue

            mime_type = part.get_content_type()
            # The decoded payload gives the real byte size for any leaf part;
            # it is None for container parts, whose size is left out.
            payload = part.get_payload(decode=True)
            if payload is None:
                lines.append(f"- {filename} ({mime_type})")
            else:
                lines.append(f"- {filename} ({mime_type}, {len(payload):,} bytes)")
        return lines
|
||||||
|
|
||||||
|
|
||||||
class ZipConverter(DocumentConverter):
|
class ZipConverter(DocumentConverter):
|
||||||
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||||
|
|
||||||
|
|
@ -1273,6 +1366,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(IpynbConverter())
|
self.register_page_converter(IpynbConverter())
|
||||||
self.register_page_converter(PdfConverter())
|
self.register_page_converter(PdfConverter())
|
||||||
self.register_page_converter(ZipConverter())
|
self.register_page_converter(ZipConverter())
|
||||||
|
self.register_page_converter(EmlConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response], **kwargs: Any
|
self, source: Union[str, requests.Response], **kwargs: Any
|
||||||
|
|
|
||||||
33
tests/test_files/test.eml
vendored
Normal file
33
tests/test_files/test.eml
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Subject: Test Email Document
|
||||||
|
From: John Doe <john.doe@example.com>
|
||||||
|
To: Jane Smith <jane.smith@example.com>
|
||||||
|
Date: Wed, 18 Dec 2024 10:00:00 +0000
|
||||||
|
CC: cc.person@example.com
|
||||||
|
|
||||||
|
--===============8484938434343225034==
|
||||||
|
Content-Type: text/plain; charset="us-ascii"
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Transfer-Encoding: 7bit
|
||||||
|
|
||||||
|
|
||||||
|
This is a test email with multiple parts.
|
||||||
|
|
||||||
|
It contains:
|
||||||
|
- Plain text content
|
||||||
|
- An attachment
|
||||||
|
- Various headers
|
||||||
|
|
||||||
|
Best regards,
|
||||||
|
John Doe
|
||||||
|
|
||||||
|
--===============8484938434343225034==
|
||||||
|
Content-Type: application/txt
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Transfer-Encoding: base64
|
||||||
|
Content-Disposition: attachment; filename="test.txt"
|
||||||
|
|
||||||
|
VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
|
||||||
|
|
||||||
|
--===============8484938434343225034==--
|
||||||
|
|
@ -126,6 +126,20 @@ CSV_CP932_TEST_STRINGS = [
|
||||||
"髙橋淳,35,名古屋",
|
"髙橋淳,35,名古屋",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
EML_TEST_STRINGS = [
|
||||||
|
"## Email Headers",
|
||||||
|
"**From:** John Doe <john.doe@example.com>",
|
||||||
|
"**To:** Jane Smith <jane.smith@example.com>",
|
||||||
|
"**Subject:** Test Email Document",
|
||||||
|
"**CC:** cc.person@example.com",
|
||||||
|
"## Email Content",
|
||||||
|
"This is a test email with multiple parts",
|
||||||
|
"- Plain text content",
|
||||||
|
"- An attachment",
|
||||||
|
"## Attachments",
|
||||||
|
"- test.txt (application/txt, 31 bytes)",
|
||||||
|
]
|
||||||
|
|
||||||
LLM_TEST_STRINGS = [
|
LLM_TEST_STRINGS = [
|
||||||
"5bda1dd6",
|
"5bda1dd6",
|
||||||
]
|
]
|
||||||
|
|
@ -197,6 +211,13 @@ def test_markitdown_local() -> None:
|
||||||
text_content = result.text_content.replace("\\", "")
|
text_content = result.text_content.replace("\\", "")
|
||||||
assert test_string in text_content
|
assert test_string in text_content
|
||||||
|
|
||||||
|
# Test EML processing
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.eml"))
|
||||||
|
assert result.title == "Test Email Document"
|
||||||
|
for test_string in EML_TEST_STRINGS:
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
assert test_string in text_content
|
||||||
|
|
||||||
# Test HTML processing
|
# Test HTML processing
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
|
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue