From e498854c3b6c51b6a567acfc4bff096eebba78ca Mon Sep 17 00:00:00 2001 From: makermotion <22776403+makermotion@users.noreply.github.com> Date: Sun, 22 Dec 2024 00:37:12 +0300 Subject: [PATCH] feat: outlook .msg converter --- pyproject.toml | 19 +++------ src/markitdown/_markitdown.py | 76 +++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3e14cec..741207d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,9 +10,7 @@ readme = "README.md" requires-python = ">=3.10" license = "MIT" keywords = [] -authors = [ - { name = "Adam Fourney", email = "adamfo@microsoft.com" }, -] +authors = [{ name = "Adam Fourney", email = "adamfo@microsoft.com" }] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", @@ -35,6 +33,7 @@ dependencies = [ "pdfminer.six", "puremagic", "pydub", + "olefile", "youtube-transcript-api", "SpeechRecognition", "pathvalidate", @@ -54,9 +53,7 @@ path = "src/markitdown/__about__.py" markitdown = "markitdown.__main__:main" [tool.hatch.envs.types] -extra-dependencies = [ - "mypy>=1.0.0", -] +extra-dependencies = ["mypy>=1.0.0"] [tool.hatch.envs.types.scripts] check = "mypy --install-types --non-interactive {args:src/markitdown tests}" @@ -64,20 +61,14 @@ check = "mypy --install-types --non-interactive {args:src/markitdown tests}" source_pkgs = ["markitdown", "tests"] branch = true parallel = true -omit = [ - "src/markitdown/__about__.py", -] +omit = ["src/markitdown/__about__.py"] [tool.coverage.paths] markitdown = ["src/markitdown", "*/markitdown/src/markitdown"] tests = ["tests", "*/markitdown/tests"] [tool.coverage.report] -exclude_lines = [ - "no cov", - "if __name__ == .__main__.:", - "if TYPE_CHECKING:", -] +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] [tool.hatch.build.targets.sdist] only-include = ["src/markitdown"] diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 
789c1e5..32d5ba2 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -21,6 +21,7 @@ from warnings import warn, resetwarnings, catch_warnings
 
 import mammoth
 import markdownify
+import olefile
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
@@ -1076,6 +1077,92 @@ class ImageConverter(MediaConverter):
         return response.choices[0].message.content
 
 
+class OutlookMsgConverter(DocumentConverter):
+    """Converts Outlook .msg files to markdown by extracting email metadata and content.
+
+    Uses the olefile package to parse the .msg file structure and extract:
+    - Email headers (From, To, Subject)
+    - Email body content
+
+    Only the Unicode ("001F") property streams are read; the Date header and
+    attachments are not extracted.
+    """
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Convert the .msg file at local_path to markdown.
+
+        Returns None when the "file_extension" keyword argument is not ".msg".
+        Raises FileConversionException when the file cannot be read or parsed.
+        """
+        # Bail if not a MSG file
+        extension = kwargs.get("file_extension") or ""
+        if extension.lower() != ".msg":
+            return None
+
+        try:
+            msg = olefile.OleFileIO(local_path)
+            try:
+                # Extract email metadata
+                md_content = "# Email Message\n\n"
+
+                # Get headers
+                headers = {
+                    "From": self._get_stream_data(msg, "__substg1.0_0C1A001F"),
+                    "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
+                    "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
+                }
+
+                # Add headers to markdown
+                for key, value in headers.items():
+                    if value:
+                        md_content += f"**{key}:** {value}\n"
+
+                md_content += "\n## Content\n\n"
+
+                # Get email body
+                body = self._get_stream_data(msg, "__substg1.0_1000001F")
+                if body:
+                    md_content += body
+
+                return DocumentConverterResult(
+                    title=headers.get("Subject"), text_content=md_content.strip()
+                )
+            finally:
+                # Close the OLE file on every path, including failures above
+                msg.close()
+
+        except Exception as e:
+            raise FileConversionException(
+                f"Could not convert MSG file '{local_path}': {str(e)}"
+            ) from e
+
+    def _get_stream_data(
+        self, msg: olefile.OleFileIO, stream_path: str
+    ) -> Union[str, None]:
+        """Helper to safely extract and decode stream data from the MSG file.
+
+        Returns the stripped text, or None when the stream is missing or unreadable.
+        """
+        try:
+            if not msg.exists(stream_path):
+                return None
+            data = msg.openstream(stream_path).read()
+        except Exception:
+            # Best effort: an unreadable stream only means the field is omitted
+            return None
+
+        # Try UTF-16 first (common for .msg files), then fall back to UTF-8
+        for encoding in ("utf-16-le", "utf-8"):
+            try:
+                return data.decode(encoding).strip()
+            except UnicodeDecodeError:
+                continue
+        # Last resort - ignore errors
+        return data.decode("utf-8", errors="ignore").strip()
+
+
 class ZipConverter(DocumentConverter):
     """Converts ZIP files to markdown by extracting and converting all contained files.
 
@@ -1285,6 +1372,7 @@ class MarkItDown:
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(OutlookMsgConverter())
 
     def convert(
         self, source: Union[str, requests.Response, Path], **kwargs: Any