From 5de769f1bcd7b32b095255da36e929a62da14054 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Mon, 16 Dec 2024 15:27:03 +0800 Subject: [PATCH] chore: excel improvements --- src/markitdown/_markitdown.py | 27 +++++++++++++++++++++++---- tests/test_files/test.xlsx | Bin 11562 -> 11770 bytes tests/test_markitdown.py | 3 +++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..daf1127 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -80,9 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): if href: try: parsed_url = urlparse(href) # type: ignore - if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in [ + "http", + "https", + "file", + ]: # type: ignore return "%s%s%s" % (prefix, text, suffix) - href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + href = urlunparse( + parsed_url._replace(path=quote(unquote(parsed_url.path))) + ) # type: ignore except ValueError: # It's not clear if this ever gets thrown return "%s%s%s" % (prefix, text, suffix) @@ -504,6 +510,11 @@ class XlsxConverter(HtmlConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ + def _clean_colname(self, colname: str | Any) -> str | Any: + if isinstance(colname, str) and colname.startswith("Unnamed:"): + return "" + return colname + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") @@ -514,7 +525,13 @@ class XlsxConverter(HtmlConverter): md_content = "" for s in sheets: md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) + sheet = sheets[s] + sheet.columns = list(map(self._clean_colname, sheet.columns)) + html_content = ( + sheet.dropna(how="all", axis=1) + .dropna(how="all", axis=0) + .to_html(index=False, na_rep="") + ) md_content += self._convert(html_content).text_content.strip() + "\n\n" return DocumentConverterResult( @@ -629,7 +646,9 @@ class MediaConverter(DocumentConverter): else: try: result = subprocess.run( - [exiftool, "-json", local_path], capture_output=True, text=True + [exiftool, "-json", local_path], + capture_output=True, + text=True, ).stdout return json.loads(result)[0] except Exception: diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx index 3a41e176eb860d6d78d92bcb2f00b2524d925df5..56ec4978178a08dbf5c627d2e2792c61486a7b25 100755 GIT binary patch delta 3944 zcmZ9Pc{~)}yT@nD*pq$9Iv5#)M0PPDvSb+hzKu0w%lfqqqbS?NO!g&(C|mYy6G=&S zl{IS(StcPn*Z1Dn>({;a{Pn!f^ZDbv&htE<^FA+Z5AAAYXt8;ptSu5j001pL0Kg0Y z0KzdcVg7f0T>brhu7vwyDlMk{rj=Quot)1!ht;%A%iL@pm|17uV|v69w{*G97$`l$ z%IeS#6;Oj6QUpGJ9Z$ZWXIZPC&M#1yqr&ePWBc{-Hk_F0do6gCI#6I_*;hD^Tzur7 zTRa@nuwc~n65GV8y=UaLH}<&*YhP`v4>m+|vG%IU=nU-Yw6QQ%ezLO& z0%`B2>}t_Fu9(u9SxtyS-za3IwrM^XSK--+x)PM$a8SpUIvEQ?1%h@$B)B6IEy?6J zpY1!OxMBWl)Hgo4i4EA%{!|SSbU-LoE!H#4IGWAbU?umde?15D`Zu#n6<{p-?HgMQ z@)%=#`9{T0&2SRY+_+uu3xnFO)(@?`gIYXGam;lLf2_NJDH%?>&mdJuNmei=-s`k5 zkL>6$=q@IxI2y9=x|Q=)(d_Y>xqV*4BV7*dX6Cn{Vi(!xAKPA8EUk2Y+^uaN)tI$i zfCDbaV9hTT^ZNC)_+HTSH9WC$|LlVA=eoImyN6_00nQhA#L@zi9un0U3!5Phnp$Xc zV@KwNswJK)v(`Mtp-XSb{nW{Q>Yy+Yi*EHP!pO^On2I6PsSw46h10dpbng;2f1JFp z1^94oG=zB)L6on3Og(3XBrQb<`Q4vd$LWAQY+R7Uvk@ZTg|KqlO*0WwBh%P5}T%-JWcdy{xhJ8 zF`SC$kc@tGPe66;*5r+zuKS&|$c~Lf;rN9Yc2UIp5{Z7y(8i)8!w+OOo-ttkT-=T{ zw!Zi%4OQnt?25;1r?F~09p^ND@?{q}Mt}N^%bdYt>W{-C&H40MZ3pi0!(faN4HPnt z_WF^m;Tk=4D;~Flk`b3;sTbve1w``HjCK3%lw zMNCH(UEmS%9=avP9}aU@HcGTeBVJS~%zq{FF+UHa(9x&Q_ zv&-UN(>R0%cD>ol#;hmGt^##3M=r>!LSSj2gk$L1L3D148A9z*Wn#CNM{psD_-@4&+~Z!_$caxPSHbGqhi=$Bx7O;1km1)2?{-dj7 z=+Nq|6-A^jc^ZG2FoGD{q0DmaBS5YCExu|wiv_o`$|5IlKrEu7$n@>eD$`4 zKG0YH{~~s@DppjK37+4 zDYx|>&52ns!4<;s6_>+4I?Mf$|b2lrn*amt7y3`*G5M1W3+ z0l}21p6v}r7>l3@w0IdWd#Lj(AheIli2t}%!9$2kV~NNkW>13X1uT3A3TjH28*r~2 z%*Df-P#H$Kl<8-uWqbQdhD|eF&rn#B+o})oU7NYqJGJjJ1^r$*rGc5?)aEYAkA2SN z>OaD!Gv)IQ?bi`;>|TK>@&?)$x?vnW8D0;qIGfG4&76d2JEW599!jf{Qw2p=gI0iN zKmg$E?A(OU&WPERw7jb65MjfHZTf)amnZU+&ho< z{vAua*=*0y&sb?ri3tvB>|+|_Z{S47l`(hX>JK^mcn0#Sl@PdX_w{z?E>7KwKEHg_ z(|nZ-m;8woZ@EkWcO}@)k$U6j>ol!9jM0u!wxlX6)28f@fNaSS>UIw@vCYbjF~V#DlQH4uWfFJG zu)-T9p215ZV=#p~wTv}Bg5+)iylCap>|t!nL}j_0tI4-W(flx#SUU|$av}3^ceF^- zZ6sIfVnsZsPy{wWTi^oY^R&JB!W~d}wEdarCI%^z^VNGaPLlgU{_KeMAmw`O^NX2% z4@*(kcN&L3`OW%G#iRt`XQIbIam2ujD5SI-1{NKP9 z{TxN{@(Xo|hvRW}?zSUiJ1fFrFKXx{i*jjHyxpN@ymDAiT1WU;QX#V=b99w1_l}eg=MprH*P&ZUH;*C+Jk%YqS>}I@j{OEr9-xzfp{u5Q>xbss~XQ!{% zBicx~D;zayyO0<2z_Uwfh>K4uqVqLQNyn9UA|K56j=;!!T256nQO`(L<8DE{`*_%7 z6DD$cd!X-XG9K2+7yDArT6vBsGvvb|Xuo&Sbj{_&tP24kShM;tfa1WXwC6-}nfBb| zE@9>V#gKf$wBNIwu$s``~owEc#2RUB3i##(zfpgt{(V#+s|Y9nS!ps8;IbUE1tRM_tZ zF>5QO&YO65nhn3c))VV?ad+Bs=Z}`zv0%QdN84I-wFUk`2ZUDy`K(41m{B1SrJ z6HS!c6kyVj?d&TZ$R@3pfR(0x_EPJWc@8u$$*kbyG}8UZBu4hq@i&eaXv1~hVT%YH z?(CAC-5JK+9_*NXhs$#hTOrtIQmOTm>wVZOgmL43on@K2te&&nl1W30vXOSFv-Hn( zT;bhs%?snoAjT+1_j}cu&#*D{$`j15lqLeJc0*z9E_p2UT+m-#Jc4hB1Pvb5K`a_> z7@l1!)=tq=l|}S&pRRc=PM09pp{{@C^83&{?)!?fId4z-0^+|d?0);cA zmFGT%_J}znGWFFH7ZUdrHKwyoJHDu}xw;L+Qlg zcKwkrQwgq^eJ<#=xFDFg-yP{3Y{)H#`GhTFOF&xp)m}C!_D^y%z!MMXQ&H1jtdr}( zpF!aFCUIbQrpv%Cgd?MEE`m0~eUVtIE?+$+yGfMZhoo(GNZdkCH)*P^p<9l@B=zc| zqZct1tqy@VY($j5^JZ6tgosiScrI8=k);BerilrF(W|+~4e6pKnpF(Okesj0};0ifF7=P(hOxH@| zVH6wL<@U2nVk7s$sobJBw@K;I=ktGd1_GEEg1{UA8UWpSGYJF!`4FyEb$;i@=UEm2 z9nj4mZF1K?AV|igzUnf|c@h4DL)J5D^YVsMbY@1YWsa4$B_$jG@(`Sc5n1I%ikYx< z@a)JQZ`Vxva@4-sx->gQ)&Bd?xHb0)C?Xt6PelmOjT|orK24(QO#a@&W zuiCM-#iqm-VDt_rZRljLPQ)N8VdB+I}^nUhsSY z`v9YYwvy_+ub(=$R|GkwBG^ehV)*;_~gmpNas+^O+BZ;8G@Dpq#Rj6Eq2%Umlgcp)XZ zMM#WrbRMP(6MjM@2qsc+DxxT1ON5V*CB?$`p8=-=09gMX;5>Rm=$3-=|KDc(-vXpK Ofl6A0Mi&0>=6?a}m@=;b delta 3750 zcmZ9PXEfYhyT(TyB@v@MqW36c3?k7PL=RCyv?%3(974&-pk8PJjlcSy{U&+o-F-E_~<#fGmzzO5OvX}Xw=)%vGNk>*?FB5QE-+$ zg@luWe#!U+kyG79rqHCWQDAE7ree6`TGgkE55>AG2{e5{N)m*Z9&8_mf`)!hqJN_@ za$8)qHGXEL0!=uw{WrS%-v)C4>xxh2LvoMUK!yWo3*qrQuLNr%&FIBz5O+(B;q_OGCSIF5GQ;1n6r<#Cx+u{p~0q6D_Qw_IqG?gU-#Q~~~?UQXteRFf;nen=NLZ)NzcsCwh5b}jOY9#zOwB_#hMfbU4~aL)R!Yb$S6V*h#(10Q2X zSaELF99MbRg<>KHpU{b``@&Qg(#w~!U+${(_AelRgC*QArPwH4YAv7hJmJ2xe>BIj zzZTr4dordo$)msj`hJ6PflKP05}zTaL9io8y#@D-I#&06Mo^b-r965o2fF!mNUPyz#rO?gM$zlw_H9*vaklRK?okBkL$*7Q-bRsp zt?%O$I7?|#J$cvrJ0NggE!+oRFmxyCQA-0g(5=jc6F$Pwf68Xbvzs9rDF!G`K7S_8 z6}`Z7eMyT|2x{sMEqw>Z(V`S@e#Z_DMQ%HH?U5x^9381OT)vy0ZrO}DWIgKKu;h|{ z`>cEI^|;)lYj>FH2)qFkfS$zQ>? z1JRHD+I!^r#QwDBG-pFE z7g2HbNF99mJ^QJq=W=|rZB{(;9*YGa1>`RHc-J-Fm^MKa}W z|84kRyGl3ML(ZpmNHk=+M0|am^JI@6t_r$*rS+C#3)w`!rvnIW$WpIbFOyue%7sHS zubvdoZ`;4ZeMj`&2g0h+c{;iKEn%mZ9NfGo7@LhO<%b)xC!s3Mt;0l5#jqy^a9)3| z`JTRA-^(QjsWT_}i;()L!yM{un(|pQ+=RgmnFW%XuQUdvbmXTN=>g;*&;v#g=sE~% z4P^teFaw^Y67(@dt)B|!^I$T;aS@v063vm#ouc|?WQ=VWi>rc+X&5n4M@JOJ$c$E+ z>L7kia`LVJ7U%x?iHp8QjX(i;d}XclKwL}97cD@9fX>W-y1mT}**)+KK6I+C_55KK zfk?Iu)tIGkz(~R^hv$Uq$et=B_-BAa{Qe9Rc9LZTA>#1VEFk=?ix0JdU%h1@zn%ZH*y#IGze0osEmM>9kerW%51I#P#CwO9 z-mLg3^&V_F=~~h_HG8N=y8Nr0po-#Jm{<@-eCOLpz;eT{Wi+$qsUuv@q-kxgYq)oY%3 z0BE^7A6rHXHJ*cgrzx4lF{I7ifTPIP`IV;6;u-`8MXn~!wT)F;5a{yq4?{06^V7a3 zW=haUKN4IfVe9TDHxn~%gEJJQ1lN|m682bV>Au;y=M(mAx^%p-%3liTxjRcdJzuIw z+ENU=IQBmY^(vC{P^sXyI>wi|8MGER&d;};6NuXZmqAT3+brudzbf2-waq&2^M;%R znW&Gj;q^KF^>xKJUJO;K5b-4Qfmr3FT~y}}W*@rN1&XPN~k}Lz`t`8xJa*OE@|R+_#=7K=UA7W3&G=0pJM*4qIqXJKRWzm|ULnh>JLaPKsA&BC z?yLQ@owYUEv>itMt_DZtD=)$GrdHmG zAtbNuDy#VST7^(!s`_2SeXYKfCNj&;07d}o%LUkf=oK|4dSU3w;*$p;x8^ zQw(0{P{q6jyl+=c8GRr2!4i*ioozMf?-OR#%6r z&Ihuss1as^6>|RJxesa;(6jv>y~jx@`*-W_0&G{Al_NW@4K%`EG*_`tk_9nQ&6s{9 zh6_{sG=aOU-6@Cd3T~Q7a^nK@=WNln-?VfwVmgj@B;OI98_rl=$8C{6>)tIR-M&Xp zi}^j=y4AHfB;DoJCgLbF%e$<3m86Qn_;`i9OPjFgUzUJ^vW9XS_shypNmP5UJH6gK z2VPpZzmS^FwywNK?>IUWFViJ2L}E^gm~Tvy@ROC^o45)j*?yLz&oC zTi^ZtThJD-Jmws~(w{!~ahJ^jgYf17&`E6wF~tDGr>V@IqfcTF@YM1XBs$_>k`7;A zN@*N7YQP@5L(em(0kW!{Fj=@Xu5Bhgp<3zoVh|OIKIsMDZ+x!m8uvd?`CqbNt>~We zFx{!QH%Aai!mbMxS3=ZuPEeWl?v1B>^wbYpOCGH{NccI4f0<8UEdbbC54S-&w}g64)oL`CY6dzR5sA*O zFZ(olX%q2;`9?Sibb_?(vv_Q*YPZxflkbGc_oc)b168B2&SH^;oY?EDVsFr>M!QH^ zr9y5{jW9Fr8nsl;J0X1%5XhxwQh<*HcyzCWhpB5p|BUV`9yV<8P-Aewub%<)?8{;h_jtT!Q{;YWASB?xU6-{ zy&329cS^F-aB;Zrh=iS(qQ$p}L*|dd$sdrpuiqsQuXap5zbd?gSsYfiW-a}A4bW=r zzRJQKdR10%l?}RU?fcaS9G}Su;Y>EpN*2=|je@cf8WT;0*&IFIs1O>6wI|MVa#5(h7&p|6*>AKcE{`)?%h#V-4T8Dnkn?feG!S1FXlG|4c{3ckPSLq9F1g#Hw~v+d6XT=uKrvLWY;nGLl1*$>4yi{YL~eyhxt+($$}tJMsBl@j-@}qk zT)#m|dr{^RZ^p0!?Shvz(Uy0vp6Z~N9*gd2*OMLoyp>I{@;GDf4Tkdeh@oJ-1|fx2HnC3@=${UB57~mAw+k-YCtAkS!@47kVCn-nA;@(^mC1mKmWEI4f4QbHf?w8*Z1Z-K~`<36uju> zyI3ps2hx8Xt7oXUrMGD2bO|7{%qy!*d{8fyiGa{PaN_;(vr3&G|>?~raXVK<;Eq-E?_5ivQ^ zTrRAG7=rW*H@1*R5IZ0yN;<@YE#?)$^1wt%$N8|8{OnjK7(M&HUHnISApbe@&x!<# ZfeCQ@ckTV(gy None: # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + # Check assertions for test_string in XLSX_TEST_STRINGS: text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Check negations + assert "Unnamed:" not in text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))