From e498854c3b6c51b6a567acfc4bff096eebba78ca Mon Sep 17 00:00:00 2001
From: makermotion <22776403+makermotion@users.noreply.github.com>
Date: Sun, 22 Dec 2024 00:37:12 +0300
Subject: [PATCH] feat: outlook .msg converter

---
 pyproject.toml                | 19 +++------
 src/markitdown/_markitdown.py | 76 +++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 14 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3e14cec..741207d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,9 +10,7 @@ readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
 keywords = []
-authors = [
-  { name = "Adam Fourney", email = "adamfo@microsoft.com" },
-]
+authors = [{ name = "Adam Fourney", email = "adamfo@microsoft.com" }]
 classifiers = [
   "Development Status :: 4 - Beta",
   "Programming Language :: Python",
@@ -35,6 +33,7 @@ dependencies = [
   "pdfminer.six",
   "puremagic",
   "pydub",
+  "olefile",
   "youtube-transcript-api",
   "SpeechRecognition",
   "pathvalidate",
@@ -54,9 +53,7 @@ path = "src/markitdown/__about__.py"
 markitdown = "markitdown.__main__:main"
 
 [tool.hatch.envs.types]
-extra-dependencies = [
-  "mypy>=1.0.0",
-]
+extra-dependencies = ["mypy>=1.0.0"]
 [tool.hatch.envs.types.scripts]
 check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
 
@@ -64,20 +61,14 @@ check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
 source_pkgs = ["markitdown", "tests"]
 branch = true
 parallel = true
-omit = [
-  "src/markitdown/__about__.py",
-]
+omit = ["src/markitdown/__about__.py"]
 
 [tool.coverage.paths]
 markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
 tests = ["tests", "*/markitdown/tests"]
 
 [tool.coverage.report]
-exclude_lines = [
-  "no cov",
-  "if __name__ == .__main__.:",
-  "if TYPE_CHECKING:",
-]
+exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
 
 [tool.hatch.build.targets.sdist]
 only-include = ["src/markitdown"]
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 789c1e5..32d5ba2 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -21,6 +21,7 @@ from warnings import warn, resetwarnings, catch_warnings
 
 import mammoth
 import markdownify
+import olefile
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
@@ -1076,6 +1077,80 @@ class ImageConverter(MediaConverter):
         return response.choices[0].message.content
 
 
+class OutlookMsgConverter(DocumentConverter):
+    """Converts Outlook .msg files to markdown by extracting headers and body.
+
+    Uses the olefile package to read the OLE streams of the .msg container:
+    - Email headers (From, To, Subject)
+    - Plain-text email body
+
+    The sent date and attachments are not extracted.
+    """
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Return the message as markdown, or None if this is not a .msg file.
+
+        Raises FileConversionException if the file cannot be parsed.
+        """
+        # Bail if not a MSG file (the extension may be missing or None)
+        extension = kwargs.get("file_extension") or ""
+        if extension.lower() != ".msg":
+            return None
+
+        try:
+            # The context manager closes the OLE file even if extraction fails
+            with olefile.OleFileIO(local_path) as msg:
+                headers = {
+                    "From": self._get_stream_data(msg, "__substg1.0_0C1A001F"),
+                    "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
+                    "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
+                }
+                body = self._get_stream_data(msg, "__substg1.0_1000001F")
+        except Exception as e:
+            raise FileConversionException(
+                f"Could not convert MSG file '{local_path}': {str(e)}"
+            ) from e
+
+        # Headers that are absent from the message are left out of the output
+        md_content = "# Email Message\n\n"
+        for key, value in headers.items():
+            if value:
+                md_content += f"**{key}:** {value}\n"
+
+        md_content += "\n## Content\n\n"
+        if body:
+            md_content += body
+
+        return DocumentConverterResult(
+            title=headers.get("Subject"), text_content=md_content.strip()
+        )
+
+    def _get_stream_data(
+        self, msg: olefile.OleFileIO, stream_path: str
+    ) -> Union[str, None]:
+        """Best-effort read of a text stream from the MSG file.
+
+        Returns the decoded, stripped text, or None if missing or unreadable.
+        """
+        try:
+            if not msg.exists(stream_path):
+                return None
+            data = msg.openstream(stream_path).read()
+        except Exception:
+            # A damaged stream should not abort the whole conversion
+            return None
+        # Try UTF-16-LE first (common for .msg text streams), then UTF-8
+        for encoding in ("utf-16-le", "utf-8"):
+            try:
+                return data.decode(encoding).strip()
+            except UnicodeDecodeError:
+                continue
+        # Last resort - drop undecodable bytes
+        return data.decode("utf-8", errors="ignore").strip()
+
+
 class ZipConverter(DocumentConverter):
     """Converts ZIP files to markdown by extracting and converting all contained files.
 
@@ -1285,6 +1360,7 @@ class MarkItDown:
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(OutlookMsgConverter())
 
     def convert(
         self, source: Union[str, requests.Response, Path], **kwargs: Any