diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 32d5ba2..d72196b 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -1083,7 +1083,6 @@ class OutlookMsgConverter(DocumentConverter):
     Uses the olefile package to parse the .msg file structure and extract:
     - Email headers (From, To, Subject, Date)
     - Email body content
-    - Attachments (listed but not converted)
     """
 
     def convert(
@@ -1101,7 +1100,7 @@ class OutlookMsgConverter(DocumentConverter):
 
             # Get headers
             headers = {
-                "From": self._get_stream_data(msg, "__substg1.0_0C1A001F"),
+                "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
                 "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
                 "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
             }
diff --git a/tests/test_files/test_outlook_msg.msg b/tests/test_files/test_outlook_msg.msg
new file mode 100644
index 0000000..05b087b
Binary files /dev/null and b/tests/test_files/test_outlook_msg.msg differ
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 4a981bd..a0626d1 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -63,6 +63,15 @@ DOCX_TEST_STRINGS = [
     "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 ]
 
+MSG_TEST_STRINGS = [
+    "# Email Message",
+    "**From:** test.sender@example.com",
+    "**To:** test.recipient@example.com",
+    "**Subject:** Test Email Message",
+    "## Content",
+    "This is the body of the test email message",
+]
+
 DOCX_COMMENT_TEST_STRINGS = [
     "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
     "49e168b7-d2ae-407f-a055-2167576f39a1",
@@ -232,6 +241,10 @@ def test_markitdown_local() -> None:
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
     validate_strings(result, CSV_CP932_TEST_STRINGS)
 
+    # Test MSG (Outlook email) processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
+    validate_strings(result, MSG_TEST_STRINGS)
+
 
 @pytest.mark.skipif(
     skip_exiftool,