feat: outlook .msg converter
This commit is contained in:
parent
125e206047
commit
e498854c3b
2 changed files with 81 additions and 14 deletions
|
|
@ -10,9 +10,7 @@ readme = "README.md"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
keywords = []
|
keywords = []
|
||||||
authors = [
|
authors = [{ name = "Adam Fourney", email = "adamfo@microsoft.com" }]
|
||||||
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
|
|
||||||
]
|
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Development Status :: 4 - Beta",
|
"Development Status :: 4 - Beta",
|
||||||
"Programming Language :: Python",
|
"Programming Language :: Python",
|
||||||
|
|
@ -35,6 +33,7 @@ dependencies = [
|
||||||
"pdfminer.six",
|
"pdfminer.six",
|
||||||
"puremagic",
|
"puremagic",
|
||||||
"pydub",
|
"pydub",
|
||||||
|
"olefile",
|
||||||
"youtube-transcript-api",
|
"youtube-transcript-api",
|
||||||
"SpeechRecognition",
|
"SpeechRecognition",
|
||||||
"pathvalidate",
|
"pathvalidate",
|
||||||
|
|
@ -54,9 +53,7 @@ path = "src/markitdown/__about__.py"
|
||||||
markitdown = "markitdown.__main__:main"
|
markitdown = "markitdown.__main__:main"
|
||||||
|
|
||||||
[tool.hatch.envs.types]
|
[tool.hatch.envs.types]
|
||||||
extra-dependencies = [
|
extra-dependencies = ["mypy>=1.0.0"]
|
||||||
"mypy>=1.0.0",
|
|
||||||
]
|
|
||||||
[tool.hatch.envs.types.scripts]
|
[tool.hatch.envs.types.scripts]
|
||||||
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
|
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
|
||||||
|
|
||||||
|
|
@ -64,20 +61,14 @@ check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
|
||||||
source_pkgs = ["markitdown", "tests"]
|
source_pkgs = ["markitdown", "tests"]
|
||||||
branch = true
|
branch = true
|
||||||
parallel = true
|
parallel = true
|
||||||
omit = [
|
omit = ["src/markitdown/__about__.py"]
|
||||||
"src/markitdown/__about__.py",
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.coverage.paths]
|
[tool.coverage.paths]
|
||||||
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
|
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
|
||||||
tests = ["tests", "*/markitdown/tests"]
|
tests = ["tests", "*/markitdown/tests"]
|
||||||
|
|
||||||
[tool.coverage.report]
|
[tool.coverage.report]
|
||||||
exclude_lines = [
|
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
|
||||||
"no cov",
|
|
||||||
"if __name__ == .__main__.:",
|
|
||||||
"if TYPE_CHECKING:",
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.hatch.build.targets.sdist]
|
[tool.hatch.build.targets.sdist]
|
||||||
only-include = ["src/markitdown"]
|
only-include = ["src/markitdown"]
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ from warnings import warn, resetwarnings, catch_warnings
|
||||||
|
|
||||||
import mammoth
|
import mammoth
|
||||||
import markdownify
|
import markdownify
|
||||||
|
import olefile
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pdfminer
|
import pdfminer
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
|
|
@ -1076,6 +1077,80 @@ class ImageConverter(MediaConverter):
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|
||||||
|
|
||||||
|
class OutlookMsgConverter(DocumentConverter):
    """Converts Outlook .msg files to markdown by extracting email metadata and content.

    Uses the olefile package to parse the .msg file structure and extract:
    - Email headers (From, To, Subject)
    - Email body content

    Only the streams listed above are read; other properties stored in the
    file (for example attachments) are not included in the output.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the .msg file at ``local_path`` to markdown.

        Args:
            local_path: Path of the file to convert.
            **kwargs: Conversion options. Only ``file_extension`` is read here;
                the file is handled only when it equals ``.msg``
                (case-insensitive).

        Returns:
            ``None`` when the extension is not ``.msg``; otherwise a
            ``DocumentConverterResult`` whose title is the Subject header
            (possibly ``None``) and whose text is the rendered markdown.

        Raises:
            FileConversionException: If the file cannot be opened or parsed.
        """
        # Bail if not a MSG file. `or ""` also covers an explicit None value,
        # which would otherwise fail on `.lower()`.
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".msg":
            return None

        try:
            msg = olefile.OleFileIO(local_path)
            try:
                # Get headers
                headers = {
                    "From": self._get_stream_data(msg, "__substg1.0_0C1A001F"),
                    "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
                    "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
                }
                # Get email body
                body = self._get_stream_data(msg, "__substg1.0_1000001F")
            finally:
                # Always release the OLE file handle, even when reading fails.
                msg.close()
        except Exception as e:
            # Chain the original error so the root cause stays in the traceback.
            raise FileConversionException(
                f"Could not convert MSG file '{local_path}': {str(e)}"
            ) from e

        # Assemble the markdown: title, non-empty headers, then the body.
        parts = ["# Email Message\n\n"]
        parts.extend(f"**{key}:** {value}\n" for key, value in headers.items() if value)
        parts.append("\n## Content\n\n")
        if body:
            parts.append(body)
        md_content = "".join(parts)

        return DocumentConverterResult(
            title=headers.get("Subject"), text_content=md_content.strip()
        )

    def _get_stream_data(
        self, msg: olefile.OleFileIO, stream_path: str
    ) -> Union[str, None]:
        """Helper to safely extract and decode stream data from the MSG file.

        Args:
            msg: The opened OLE container.
            stream_path: Name of the stream to read.

        Returns:
            The decoded, whitespace-stripped text, or ``None`` when the stream
            does not exist or cannot be read.
        """
        try:
            if not msg.exists(stream_path):
                return None
            data = msg.openstream(stream_path).read()
        except Exception:
            # Deliberate best effort: an unreadable stream is treated as absent
            # so one bad field does not abort the whole conversion.
            return None

        # Try UTF-16 first (common for .msg files), then fall back to UTF-8.
        for encoding in ("utf-16-le", "utf-8"):
            try:
                return data.decode(encoding).strip()
            except UnicodeDecodeError:
                continue

        # Last resort - ignore errors
        return data.decode("utf-8", errors="ignore").strip()
|
||||||
|
|
||||||
|
|
||||||
class ZipConverter(DocumentConverter):
|
class ZipConverter(DocumentConverter):
|
||||||
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||||
|
|
||||||
|
|
@ -1285,6 +1360,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(IpynbConverter())
|
self.register_page_converter(IpynbConverter())
|
||||||
self.register_page_converter(PdfConverter())
|
self.register_page_converter(PdfConverter())
|
||||||
self.register_page_converter(ZipConverter())
|
self.register_page_converter(ZipConverter())
|
||||||
|
self.register_page_converter(OutlookMsgConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue