feat: outlook .msg converter

This commit is contained in:
makermotion 2024-12-22 00:37:12 +03:00
parent 125e206047
commit e498854c3b
2 changed files with 81 additions and 14 deletions

View file

@ -10,9 +10,7 @@ readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
license = "MIT" license = "MIT"
keywords = [] keywords = []
authors = [ authors = [{ name = "Adam Fourney", email = "adamfo@microsoft.com" }]
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
]
classifiers = [ classifiers = [
"Development Status :: 4 - Beta", "Development Status :: 4 - Beta",
"Programming Language :: Python", "Programming Language :: Python",
@ -35,6 +33,7 @@ dependencies = [
"pdfminer.six", "pdfminer.six",
"puremagic", "puremagic",
"pydub", "pydub",
"olefile",
"youtube-transcript-api", "youtube-transcript-api",
"SpeechRecognition", "SpeechRecognition",
"pathvalidate", "pathvalidate",
@ -54,9 +53,7 @@ path = "src/markitdown/__about__.py"
markitdown = "markitdown.__main__:main" markitdown = "markitdown.__main__:main"
[tool.hatch.envs.types] [tool.hatch.envs.types]
extra-dependencies = [ extra-dependencies = ["mypy>=1.0.0"]
"mypy>=1.0.0",
]
[tool.hatch.envs.types.scripts] [tool.hatch.envs.types.scripts]
check = "mypy --install-types --non-interactive {args:src/markitdown tests}" check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
@ -64,20 +61,14 @@ check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
source_pkgs = ["markitdown", "tests"] source_pkgs = ["markitdown", "tests"]
branch = true branch = true
parallel = true parallel = true
omit = [ omit = ["src/markitdown/__about__.py"]
"src/markitdown/__about__.py",
]
[tool.coverage.paths] [tool.coverage.paths]
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"] markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
tests = ["tests", "*/markitdown/tests"] tests = ["tests", "*/markitdown/tests"]
[tool.coverage.report] [tool.coverage.report]
exclude_lines = [ exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.hatch.build.targets.sdist] [tool.hatch.build.targets.sdist]
only-include = ["src/markitdown"] only-include = ["src/markitdown"]

View file

@ -21,6 +21,7 @@ from warnings import warn, resetwarnings, catch_warnings
import mammoth import mammoth
import markdownify import markdownify
import olefile
import pandas as pd import pandas as pd
import pdfminer import pdfminer
import pdfminer.high_level import pdfminer.high_level
@ -1076,6 +1077,80 @@ class ImageConverter(MediaConverter):
return response.choices[0].message.content return response.choices[0].message.content
class OutlookMsgConverter(DocumentConverter):
    """Converts Outlook .msg files to markdown by extracting email metadata and content.

    Uses the olefile package to parse the .msg file structure and extract:
    - Email headers (From, To, Subject)
    - Email body content

    Only the streams listed above are read; dates and attachments are not
    extracted by this converter.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the .msg file at ``local_path`` to markdown.

        Args:
            local_path: Path of the file on disk.
            **kwargs: Conversion options; only ``file_extension`` is read here.

        Returns:
            ``None`` when ``file_extension`` is not ``.msg`` (case-insensitive),
            otherwise a ``DocumentConverterResult`` whose title is the subject
            (possibly ``None``) and whose text is the rendered markdown.

        Raises:
            FileConversionException: If the file cannot be opened or parsed.
        """
        # Bail if not a MSG file (a missing or None extension is treated as "not MSG")
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".msg":
            return None

        try:
            msg = olefile.OleFileIO(local_path)
            try:
                # Get headers
                headers = {
                    "From": self._get_stream_data(msg, "__substg1.0_0C1A001F"),
                    "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
                    "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
                }

                # Get email body
                body = self._get_stream_data(msg, "__substg1.0_1000001F")
            finally:
                # Always release the underlying file handle, even when a
                # stream read fails part-way through.
                msg.close()

            # Assemble the markdown; headers with no value are omitted
            parts = ["# Email Message\n\n"]
            parts.extend(
                f"**{key}:** {value}\n" for key, value in headers.items() if value
            )
            parts.append("\n## Content\n\n")
            if body:
                parts.append(body)
            md_content = "".join(parts)

            return DocumentConverterResult(
                title=headers.get("Subject"), text_content=md_content.strip()
            )

        except Exception as e:
            # Chain the original error so the root cause stays in the traceback
            raise FileConversionException(
                f"Could not convert MSG file '{local_path}': {str(e)}"
            ) from e

    def _get_stream_data(
        self, msg: olefile.OleFileIO, stream_path: str
    ) -> Union[str, None]:
        """Helper to safely extract and decode stream data from the MSG file.

        Args:
            msg: An open OLE container.
            stream_path: Name of the stream to read.

        Returns:
            The decoded, whitespace-stripped text, or ``None`` when the stream
            does not exist or reading it fails.
        """
        try:
            if not msg.exists(stream_path):
                return None

            data = msg.openstream(stream_path).read()

            # Try UTF-16 first (common for .msg files), then UTF-8
            for encoding in ("utf-16-le", "utf-8"):
                try:
                    return data.decode(encoding).strip()
                except UnicodeDecodeError:
                    continue

            # Last resort - ignore errors
            return data.decode("utf-8", errors="ignore").strip()
        except Exception:
            # Deliberately best-effort: an unreadable stream is treated the
            # same as a missing one so the remaining fields can still be used.
            return None
class ZipConverter(DocumentConverter): class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files. """Converts ZIP files to markdown by extracting and converting all contained files.
@ -1285,6 +1360,7 @@ class MarkItDown:
self.register_page_converter(IpynbConverter()) self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter()) self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter()) self.register_page_converter(ZipConverter())
self.register_page_converter(OutlookMsgConverter())
def convert( def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any self, source: Union[str, requests.Response, Path], **kwargs: Any