From 3207e3ab380c21f63d9fd970de41c348c65a5b7a Mon Sep 17 00:00:00 2001 From: makermotion <22776403+makermotion@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:30:53 +0300 Subject: [PATCH] add test, adjust docstring --- src/markitdown/_markitdown.py | 3 +-- tests/test_files/test_outlook_msg.msg | Bin 0 -> 13312 bytes tests/test_markitdown.py | 13 +++++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 tests/test_files/test_outlook_msg.msg diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 32d5ba2..d72196b 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1083,7 +1083,6 @@ class OutlookMsgConverter(DocumentConverter): Uses the olefile package to parse the .msg file structure and extract: - Email headers (From, To, Subject, Date) - Email body content - - Attachments (listed but not converted) """ def convert( @@ -1101,7 +1100,7 @@ class OutlookMsgConverter(DocumentConverter): # Get headers headers = { - "From": self._get_stream_data(msg, "__substg1.0_0C1A001F"), + "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), } diff --git a/tests/test_files/test_outlook_msg.msg b/tests/test_files/test_outlook_msg.msg new file mode 100644 index 0000000000000000000000000000000000000000..05b087b77c785c8b57a479485b9715fb7dfecbb4 GIT binary patch literal 13312 zcmeHN-BVk~6+c3bjg7ILwn=KIj;=}Trgp#*2rveM2nZVkG6mtd9nCNjlCWAKQAxyM zJj|q-$xPdqzW6l{{rCs^(npW~gZ8mAopjpA=DE{8CG_{(s}+Li<6J|AWX6m4-g|b> z?(dwl=j@mK*T1~{&)@y&(!b?y~s1$oPDb zzCjk~blees#sL(WabNo9xsPR^kLX*voQG{cD~qxqeG-$RR3zgSUgBs|MoUMcvLQ*y zNgm$|rnC%ty-lCX3-QHU3oA>L@u|sJ-`vVld}V%RIdXepa(2FN>fS;-fhfBpQ|37* zTT+57TaaN(R(+0)K_-?ZQM!g_0enB-$5oaHWVDj^fvX7Wop!Lb`seGv)*P0aMUG0Z z+=rz~uw@Ps6yz4P3PIYSbYq^FHX3A1=`!Rm$lIvz$Df0`45TI%L=KyFA#nD~0G>ho zIdUD(0rSn?_K!|4B$zfmkHLq_e5c+?J??|1!8`+*RQp=S5%;;7z z(rG-6EoWdwjv*~LE)&3wqpTM?OhD=hP|J{f9a3$Z_cAPVAKz(6ejggAVa@j>3(j%$ z@cbS`S>lky%CQ9>%pQ{*Q^uzl-vvm~3%Rz<2vIhAa2};Olq603+`yY9^v8nnyd1nC zBctF>p(pxZ+VF0}Mm@%_=w}E2Mo^N1b=OSI5Ik}O?S}CjLfIf%(nidOOTWAx_~X+* zT)6m2K|b}Jk9S~I{gBdgUh2b>TBnUR5j;PDDkPm@#u2AZ#f#9jxI%_4<;8D=cAgrll|}MtwH-T&ta{#*S>`DSTkOj!x#Ou`DpFQ z5eM&K)}J_Lq#Sy1s?Q4OOx4fRh!O|>p2gFaTX`LD*;RuvL@Dfg=f$Hx_K7erMCT`| z)#F5|k_{!2g>ue3);8Be{e%69Y^k{Mu(FX4If-m4m(G=)L^_kr1|KJ~#X_Z$%Adhk zFp(`k%9YE`WGR=-kM8WCaIKgxmQbH4WzxaKdMT4B1QV(JRwg*Q(|snfwH8WjCQIqW zWmx(XSUUHL}xnBI1ZS@T`Vlq*hVBbm!P8<}!Bxt^)(Yq-DG7iF#yL8tr7 zJ~2bVurqc8g4T&w$6c=x6h&9g&d#1odHVXgteKvKu47&=e)4-m-hKMFWY4Fabyc1| z{Z(%JwvGMrn%UpN#c?7M3CJZwf2+nnRNMc$^}~j~TCI;@wJ)^Tye4@Tqj_2R{i4~uY6VlUX_Jv4rG{xL_0rRdb` z;%qb)_j-z;&$|9=*R5vr&lAYZ`NGaWd-IJu|JZf0+5BVc?fmm*wLjY8I1=w}8)Z(2 zww!wf-us?K?nXX`d>)x==F7<33w;~;JIJpfv)yZW^FG-2(dg||>wEB~9lnqJ1LUj7 zKSXAmAK`ti`c1kQ-+jn8khy0GA@?H>Acv6$ksD!+qI|4L-Mc8Cs;is9_w72mzCT{* zaW!QbUf=k%g!3V9PI>>yGiwQFMlb1AdC3P;n8euW>UJaIncE3X*)Tn$!yv8Vk9RaY zf#z__v1)E=_U;kgJ4Bba(eo|w8GOv+wLNqrzGh{_FPpnMp2F9wcb2QwyCB|4hLGuD zjng&kRH!`W1M}NQW3DfkGIp3+Pha6&%hb&N z#2vs{RBM0kqqX?-l`8~CYK=dw)}L#X7GL9k)IV(ePdENtAGP?Ls~#XTwbFkGtC@EE zxi)L@=___0(h7eVzk_J;xi8S-(^pP12#}iNPa*Dj1FxI^xF^x#zY5-u2#(YmUmyS6 z4{7l)g7h%EKrR03;B7zua{Oo1_}qhP@o9VR zA(@)%Pu;ox=;EK-87;nU{vp2mo2O>|;oke`@wxu%^yj``i%;LRACU+>D0i@zARKZgXX`+v@9T72GNF1p7g zqyOFwQDf9qoQ|z8+Bdo|=~ngp^Cl>}rc|p`uV(fSV(y*6ENDB_>GGr7ICe)7kX`uWd${PWh!O*Z~Hr`a{l)@k0& z#@}oF^WILY|2TM$I{vSEz~}#tJL>r7eWh0auOj}rrfTsgz_knM-~6Pm)F{V8oWmx|2)rW@uT2B_4^;s;ye$t&F2*A JMf%!W;Qy?8Su_9u literal 0 HcmV?d00001 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..a0626d1 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -63,6 +63,15 @@ DOCX_TEST_STRINGS = [ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", ] +MSG_TEST_STRINGS = [ + "# Email Message", + "**From:** test.sender@example.com", + "**To:** test.recipient@example.com", + "**Subject:** Test Email Message", + "## Content", + "This is the body of the test email message", +] + DOCX_COMMENT_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", @@ -232,6 +241,10 @@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) validate_strings(result, CSV_CP932_TEST_STRINGS) + # Test MSG (Outlook email) processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) + validate_strings(result, MSG_TEST_STRINGS) + @pytest.mark.skipif( skip_exiftool,