From 5de769f1bcd7b32b095255da36e929a62da14054 Mon Sep 17 00:00:00 2001
From: Hew Li Yang
Date: Mon, 16 Dec 2024 15:27:03 +0800
Subject: [PATCH 01/11] chore: excel improvements

---
 src/markitdown/_markitdown.py | 27 +++++++++++++++++++++++----
 tests/test_files/test.xlsx   | Bin 11562 -> 11770 bytes
 tests/test_markitdown.py     |  3 +++
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 96997cf..daf1127 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -80,9 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
         if href:
             try:
                 parsed_url = urlparse(href)  # type: ignore
-                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
+                if parsed_url.scheme and parsed_url.scheme.lower() not in [
+                    "http",
+                    "https",
+                    "file",
+                ]:  # type: ignore
                     return "%s%s%s" % (prefix, text, suffix)
-                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
+                href = urlunparse(
+                    parsed_url._replace(path=quote(unquote(parsed_url.path)))
+                )  # type: ignore
             except ValueError:  # It's not clear if this ever gets thrown
                 return "%s%s%s" % (prefix, text, suffix)
@@ -504,6 +510,11 @@ class XlsxConverter(HtmlConverter):
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
     """

+    def _clean_colname(self, colname: str | Any) -> str | Any:
+        if isinstance(colname, str) and colname.startswith("Unnamed:"):
+            return ""
+        return colname
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a XLSX
         extension = kwargs.get("file_extension", "")
@@ -514,7 +525,13 @@ class XlsxConverter(HtmlConverter):
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
+            sheet = sheets[s]
+            sheet.columns = list(map(self._clean_colname, sheet.columns))
+            html_content = (
+                sheet.dropna(how="all", axis=1)
+                .dropna(how="all", axis=0)
+                .to_html(index=False, na_rep="")
+            )
             md_content += self._convert(html_content).text_content.strip() + "\n\n"

         return DocumentConverterResult(
@@ -629,7 +646,9 @@ class MediaConverter(DocumentConverter):
         else:
             try:
                 result = subprocess.run(
-                    [exiftool, "-json", local_path], capture_output=True, text=True
+                    [exiftool, "-json", local_path],
+                    capture_output=True,
+                    text=True,
                 ).stdout
                 return json.loads(result)[0]
             except Exception:

diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx
index 3a41e176eb860d6d78d92bcb2f00b2524d925df5..56ec4978178a08dbf5c627d2e2792c61486a7b25 100755
GIT binary patch
(binary deltas omitted)

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ ... @@ def test_markitdown_local() -> None:
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
+    # Check assertions
     for test_string in XLSX_TEST_STRINGS:
         text_content = result.text_content.replace("\\", "")
         assert test_string in text_content
+    # Check negations
+    assert "Unnamed:" not in text_content

     # Test DOCX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
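For context on the converter change above: when a sheet's header row has blank cells, `pd.read_excel` names them `Unnamed: <n>`, and `DataFrame.to_html` prints missing cells as the literal string `NaN`. A rough sketch of the cleanup this patch applies, on a toy frame standing in for a `pd.read_excel` result (column names illustrative):

```python
import pandas as pd

# Stand-in for a sheet with one blank header cell and one fully empty column
df = pd.DataFrame(
    {"Name": ["alpha", "beta"], "Unnamed: 1": ["x", "y"], "Unnamed: 2": [None, None]}
)

def clean_colname(colname):
    # Mirrors XlsxConverter._clean_colname: blank out pandas' placeholders
    if isinstance(colname, str) and colname.startswith("Unnamed:"):
        return ""
    return colname

df.columns = list(map(clean_colname, df.columns))
html = (
    df.dropna(how="all", axis=1)  # drop fully empty columns
    .dropna(how="all", axis=0)  # drop fully empty rows
    .to_html(index=False, na_rep="")
)
assert "Unnamed:" not in html and "NaN" not in html
```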
From 19dc6a36410a1c9d42fd961b34a1d6ad5b0406e2 Mon Sep 17 00:00:00 2001
From: Hew Li Yang
Date: Mon, 16 Dec 2024 15:44:30 +0800
Subject: [PATCH 02/11] chore: update test excel with a nan

---
 tests/test_files/test.xlsx | Bin 11770 -> 11739 bytes
 tests/test_markitdown.py   |  1 +
 2 files changed, 1 insertion(+)

diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx
index 56ec4978178a08dbf5c627d2e2792c61486a7b25..0dcbeb9b15bc026e88c46f179e8df38705cc70b1 100755
GIT binary patch
(binary deltas omitted)

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ ... @@ def test_markitdown_local() -> None:
         assert test_string in text_content
     # Check negations
     assert "Unnamed:" not in text_content
+    assert "NaN" not in text_content

     # Test DOCX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
zA-A_MOjSt%Peyba8b3fRDDE+SSIgFoG%aqyX>RvjI}=8?h}cRc{kexyqDB6^YmDVv zg?XQnITz$)@lEl|Mw!0Y_CeU)1Pj|`^G=op{TshMGpv(P#Ft_m`$yNSQsN1RtcZoGpkI=Flg=qy zuX+RhlyG$K+Qjqi3n&mCP$Z$`k1_g1@Njm?0!UnR-|H24n$PF$oG!+S(&q<*?qmnd!DzQ7KD z0nf%#{jK_SVwUxI$6R~+alPKd`F=jNpFe-^KHv9dzq|c9f854@y;b|&ALGxLx9@+$ z^>(>_+~4=b7rQ^Tmr43{($2pa`E8_+lWz}?Q+v2T@oiBY06P7XGX6=>iYiw0T5)_T z4&!S)-^FIU%k)-{Q{BYR>VPd3DcBT$1(gO)@&VYrF4JX{>E<$z)RImxS04ns^||xD z^@2TD37n|XXH@CuDl6(vC%7h)1i^+{-@nF#;4)EVnNejay`&u%T2VDYu;C2{UJW>R zc)%iwBFA&}-)F26U(y3QV6GCFPmJJ_8b1hjW;H@|e2y7q9_KPGIjs}S8!nW8-P&Uf z8VG}0Ix4cxD6*CfXIlzZwiH~^Dd`25)cHZMTVFM*vdO5jksaq&3U*|KAlUN}7i>*z z?g5KzcvPk-a;ASF?WAXP<08{2HEOU6uWKIX!X1wztmEDKfVkMKs4`+u-g_gQcR7Gt zKJSa35B$P?B8p5-7LDSmgB2ftdT>J>3=5}@_u!uN19*L0|0C{fP9mAtY0`l!7l@qQ z(L>JfJbx-(bM$kA$O{;2rHkO203YEBvTrHGG_-~D$|Od z4_*&_twtC0Ze1ugK~8EJB#s(g!Z${n?;ZjAM$K8|9JynF-Xx372`__$QK5jGqK60Q zH)B(-sNlM@SiedQ1e+6Kqj>&m;nspz>;rIhRR!N9m0SZCUZy@!aw5!1R4LPf>tYdk ze6L>g>LW$;$;M`VO(DTic+y(on|_FA!8gwq8!URhxq>%EVj9?-1RKSZ3mX2Y0^$od2L#^O@zVf|CVKeO%;!4d?T znmP@$vm`YE39BnXE&HJCqJxw1B^!Sv%e}0;FVLo}Rfw$W#W{j zJv6Stne51iwA5pZltQsk&WpKv^O<x*0B6Lfvn6Zy;- None: assert test_string in text_content # Check negations assert "Unnamed:" not in text_content + assert "NaN" not in text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) From 42027aac2d1f6172c9ba09ab6458be29cc12c2b8 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Mon, 16 Dec 2024 17:35:44 +0800 Subject: [PATCH 03/11] chore: type annot --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index daf1127..34d6551 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -510,7 +510,7 @@ class XlsxConverter(HtmlConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. 
""" - def _clean_colname(self, colname: str | Any) -> str | Any: + def _clean_colname(self, colname: Any) -> Any: if isinstance(colname, str) and colname.startswith("Unnamed:"): return "" return colname From c2aae4dddab8e9e04fae9141aa1905e1df22d91a Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Tue, 17 Dec 2024 14:03:39 +0800 Subject: [PATCH 04/11] chore: make cleaning optional --- src/markitdown/_markitdown.py | 24 ++++++++++++++++-------- tests/test_markitdown.py | 11 ++++++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 376c75c..a72a963 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -527,7 +527,16 @@ class XlsxConverter(HtmlConverter): return "" return colname - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: + return ( + df.rename(columns=lambda col: self._clean_colname(col)) + .dropna(how="all", axis=1) + .dropna(how="all", axis=0) + ) + + def convert( + self, local_path, beautify: bool = True, **kwargs + ) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") if extension.lower() != ".xlsx": @@ -535,14 +544,13 @@ class XlsxConverter(HtmlConverter): sheets = pd.read_excel(local_path, sheet_name=None) md_content = "" - for s in sheets: - md_content += f"## {s}\n" - sheet = sheets[s] - sheet.columns = list(map(self._clean_colname, sheet.columns)) + for name, sheet in sheets.items(): + md_content += f"## {name}\n" + df = self._clean_dataframe(sheet) if beautify else sheet html_content = ( - sheet.dropna(how="all", axis=1) - .dropna(how="all", axis=0) - .to_html(index=False, na_rep="") + df.to_html(index=False, na_rep="") + if beautify + else df.to_html(index=False) ) md_content += self._convert(html_content).text_content.strip() + "\n\n" diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 2f061dc..bb666e9 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -42,6 +42,7 @@ XLSX_TEST_STRINGS = [ "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", ] + DOCX_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", @@ -139,14 +140,18 @@ def test_markitdown_local() -> None: markitdown = MarkItDown() # Test XLSX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + # XlsxConverter has an additional kwarg `beautify`, which defaults to True + result = markitdown.convert( + os.path.join(TEST_FILES_DIR, "test.xlsx"), beautify=False + ) + result_cleaned = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) # Check assertions for test_string in XLSX_TEST_STRINGS: text_content = result.text_content.replace("\\", "") assert test_string in text_content # Check negations - assert "Unnamed:" not in text_content - assert "NaN" not in text_content + assert "Unnamed:" not in result_cleaned.text_content + assert "NaN" not in result_cleaned.text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) From 5c60d8ca12ce03a89747f9be1cb124f2edfac052 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Tue, 17 Dec 2024 21:17:40 +0800 Subject: [PATCH 05/11] chore: finer flags, forward `na_rep` --- src/markitdown/_markitdown.py | 30 ++++++++++++++++-------------- tests/test_markitdown.py | 14 ++++++-------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/markitdown/_markitdown.py 
From 5c60d8ca12ce03a89747f9be1cb124f2edfac052 Mon Sep 17 00:00:00 2001
From: Hew Li Yang
Date: Tue, 17 Dec 2024 21:17:40 +0800
Subject: [PATCH 05/11] chore: finer flags, forward `na_rep`

---
 src/markitdown/_markitdown.py | 30 ++++++++++++++++--------------
 tests/test_markitdown.py      | 14 ++++++--------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index a72a963..67f31af 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -523,19 +523,18 @@ class XlsxConverter(HtmlConverter):
     """

     def _clean_colname(self, colname: Any) -> Any:
+        # Remove Pandas header placeholders
         if isinstance(colname, str) and colname.startswith("Unnamed:"):
             return ""
         return colname

-    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
-        return (
-            df.rename(columns=lambda col: self._clean_colname(col))
-            .dropna(how="all", axis=1)
-            .dropna(how="all", axis=0)
-        )
-
     def convert(
-        self, local_path, beautify: bool = True, **kwargs
+        self,
+        local_path,
+        na_rep: Any = "",
+        drop_empty_cols: bool = False,
+        drop_empty_rows: bool = False,
+        **kwargs,
     ) -> Union[None, DocumentConverterResult]:
         # Bail if not a XLSX
         extension = kwargs.get("file_extension", "")
@@ -546,12 +545,15 @@ class XlsxConverter(HtmlConverter):
         md_content = ""
         for name, sheet in sheets.items():
             md_content += f"## {name}\n"
-            df = self._clean_dataframe(sheet) if beautify else sheet
-            html_content = (
-                df.to_html(index=False, na_rep="")
-                if beautify
-                else df.to_html(index=False)
-            )
+            sheet = sheet.rename(columns=lambda col: self._clean_colname(col))
+
+            if drop_empty_cols:
+                sheet = sheet.dropna(axis=1, how="all")
+
+            if drop_empty_rows:
+                sheet = sheet.dropna(axis=0, how="all")
+
+            html_content = sheet.to_html(index=False, na_rep=na_rep)
             md_content += self._convert(html_content).text_content.strip() + "\n\n"

         return DocumentConverterResult(
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index bb666e9..aeba9b4 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -140,18 +140,16 @@ def test_markitdown_local() -> None:
     markitdown = MarkItDown()

     # Test XLSX processing
-    # XlsxConverter has an additional kwarg `beautify`, which defaults to True
-    result = markitdown.convert(
-        os.path.join(TEST_FILES_DIR, "test.xlsx"), beautify=False
-    )
-    result_cleaned = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
+    text_content = result.text_content.replace("\\", "")
+
     # Check assertions
     for test_string in XLSX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
         assert test_string in text_content
+
     # Check negations
-    assert "Unnamed:" not in result_cleaned.text_content
-    assert "NaN" not in result_cleaned.text_content
+    assert "Unnamed:" not in result.text_content
+    assert "NaN" not in result.text_content

     # Test DOCX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))

From 113f7748b79a0b0ac060d331dec727adf0c04e55 Mon Sep 17 00:00:00 2001
From: Hew Li Yang
Date: Tue, 17 Dec 2024 21:38:40 +0800
Subject: [PATCH 06/11] chore: simplify xlsx tests

---
 tests/test_markitdown.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index aeba9b4..a7c3064 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -148,8 +148,8 @@ def test_markitdown_local() -> None:
         assert test_string in text_content

     # Check negations
-    assert "Unnamed:" not in result.text_content
-    assert "NaN" not in result.text_content
+    assert "Unnamed:" not in text_content
+    assert "NaN" not in text_content

     # Test DOCX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
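After these two patches, the single `beautify` switch is replaced by three independent knobs, all forwarded through `MarkItDown.convert`. A usage sketch (file name hypothetical; defaults as in the signature above):

```python
from markitdown import MarkItDown

result = MarkItDown().convert(
    "report.xlsx",
    na_rep="-",            # render missing cells as "-" instead of ""
    drop_empty_cols=True,  # both drops are off by default
    drop_empty_rows=True,
)
```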
From 7b64e6ebfd370fc361ab0ace9a0b0dcfa89d8700 Mon Sep 17 00:00:00 2001
From: Hew Li Yang
Date: Sun, 22 Dec 2024 21:22:41 +0800
Subject: [PATCH 07/11] chore: consider header for column-wise drop

---
 src/markitdown/_markitdown.py |   9 +++++++--
 tests/test_files/test.xlsx   | Bin 11739 -> 12088 bytes
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 67f31af..a576196 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -525,7 +525,7 @@ class XlsxConverter(HtmlConverter):
     def _clean_colname(self, colname: Any) -> Any:
         # Remove Pandas header placeholders
         if isinstance(colname, str) and colname.startswith("Unnamed:"):
-            return ""
+            return None
         return colname

     def convert(
@@ -548,11 +548,16 @@ class XlsxConverter(HtmlConverter):
             sheet = sheet.rename(columns=lambda col: self._clean_colname(col))

             if drop_empty_cols:
-                sheet = sheet.dropna(axis=1, how="all")
+                # also consider headers to be part of the column
+                sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()]

             if drop_empty_rows:
                 sheet = sheet.dropna(axis=0, how="all")

+            # convert remaining NaN's to empty string
+            # because .to_html(na_rep="") does not apply to headers
+            sheet.columns = sheet.columns.fillna(na_rep)
+
             html_content = sheet.to_html(index=False, na_rep=na_rep)
             md_content += self._convert(html_content).text_content.strip() + "\n\n"

diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx
index 0dcbeb9b15bc026e88c46f179e8df38705cc70b1..9153d5292cbf75d168e4a753df4241f38913045c 100755
GIT binary patch
(binary deltas omitted)
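The motivation for the mask above: `dropna(axis=1, how="all")` looks only at cell values, so a column whose only content is its header would be dropped as "empty". A small demonstration on toy data:

```python
import pandas as pd

# One column with a header but no values, one headerless column with values
df = pd.DataFrame({"Header only": [None, None], None: [1, 2]})

# Value-only test discards the labelled-but-empty column
assert "Header only" not in df.dropna(axis=1, how="all").columns

# The patched mask treats the header as part of the column and keeps it
keep = df.notna().any() | df.columns.notna()
assert "Header only" in df.loc[:, keep].columns
```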
z;_;$X9)!i%oA!9K_gClfyj&|egu!()$a^!$&VdlaDV#p5g|KSb0aJgX+0j_`XjY=`?y9WDuRV1cJ zCIG=2X#V2dWs1&)@mw(Oa0isTgSrdK)$=OG!(BN+jh0NBkzy+xU|-yHW@SnejJrU* z#lu*cLnDY6V$lG+K4Dx%If^X}w$i6HP3=(;V<-*Y1E+*jIm~Bkc`XXp1T=vb-^4(p z|ES^nq!2)JRmP7zuBm_2*BF77%3606rStLP&T}48`dNU>!=(u^en=nmyuttr09$Ol z;ZWXUvL}Tmh7^_DCo!E+R(T2B0#jV)smQ3R61)Zm+|0r)4%9|8sUs8*eAsX`_nor^ zS4v1BMwaN2VJF|-llgQ)>)Qy426%sd;k|HeMt%lJnZ}zPiUvnFAV%5Y6>~`mM6i%D z;{M8lW{+E!IK4rAvDz5fvnA~=LmLqnw1=Mbbu2aRRik1uOU;Xvp8A|#pRY$E-C!+L zZ6h-v!O|l$4*nA)m@Qzu&4k?YA0ek`Hw>q6q8A1?soKz4q^eQ}In5?N@o*bjVGrnn zVi~m#n;t!l$_9`+`kB8%~reWXVC(ed&AdD>X3P4O)9p#`<15Z|EW`?E9 zNMm2Uob8;)f82@PidG#CPl1?kFAVrqV`XG=1ak#b84W_;_XVHHfA<1aiHZCsqy1Z% z;U!>3;3<3vjQjtHe-;c1mciG+_&NWdY>7w{e8x4=bs8VMz`3?94<4-{`H0VC_+#YgiB;Y-fdE=x_1( zpIH5S!8d{bMlK#K2_s7u#3u=G;)N_#5g{QvI`+;k^Gkf3>vuKePV delta 3414 zcmY*cXHXM}5)OnGgCPV65SmnhBnpTUr9(LBy*CvIMIyaQiNZku>7jR|83d`)q^WeJ zHvy%dfC2_Av{0URZ|2>+{jsw# z0Dv?4IxZnvg6V=bP#PM>c|sJenb7;Cif#M#8qBn)a{B0ta}!UFo|^O2J|#cF4152j z-*+g)m>|&%XyW zNPP06^(5Rxrp}(D!jt;2Qd>2CWTsolR5|2FloK?kZ^l*abPD9BB`wHxpS9H&r)$nN zsVMVA+W4WUi`wnnlq+`W#@SIu?!r5qlo7QT>i$4DFw>=(9v4=$mh&j0s^m2w7x{zdmITNh*eMH0%IP zfOL8T!A>MBqX?ns{L-eKAo)2OD}mQ}!=M@0?M>P-QPJ_kTEgs76ZzOr4*`-#_ao6l&3qjk0|%D0m& z1kPG*K{pF8NH+tD%wp}_I$~IjHF^=1S9&5|*2=Ke7ZVWC-?_=`^obe$Av?c zZ~4t=vG|0!SHJFhN~F64VXNnlLXJZsLJg!n2Y$X{sg)d`=W7>d z_Cyv_-nKu8RWBAco0t^dw?)#q0oY~SiP80@4(?J-ci~aIpM<=*--`3@? z3J*BeD(51Q+!%G|VMSj^Q{t=|YnRA(4bdXo$-A zsMfc`(xm(qsh|0mhC40^eGvzL2U@M2ZRh(}5!-6Gr**^u>)U~A9~zGW0+eqq=;9|& z*O~g)rh3AtziOB%ucEPw{yGUvOwCFAcB8g5bF4%}_J{O)XX?$Lt9-)#ixT4XymW2z zxg#$x*dG{J99)Wdd9CdL!V#}oTYcr~jnNP8`UXy~RD{SFeAo4&9Jjc9BWU0{q9>er@W}JQlB``qQfD~N*d8}{FcwTI zN<`O3xSiO;t!U|k3yN(?D{uROR#+u(UJH>5omM)NlWXrCWSZUIBu}#rgZoXDZk1iL zxhm@0AjzB*_Vu_9DLbsZ?_+-c6$}8Jp92B^kO|xW*hG7YHAceF3^pjz_{0#=`nWjZ8BOwJ#Q|GQL>GC^2h!q+X;_O7eQSlXVQ=mgC1C z+8dXevoKZ<$YZWnlU*}P(;l%lr8wJ+H&b%wQegruZY}i;k`L(&1ZE1AtM_MiLK&e< zu=pBRPPIES>l%zX)SmsjW0Teu!QBRgW?W}rcdmDeIm%#CzPwfTCszCunt7qgxVwQP zl7USh${5EOzZ6)1;F1GxUX~4fRa^*q9)g`R9`xcZPIEQt#3%T-n`a#e=K88aAQaxh zRn)bSI}&aZdB2)2f>P%s*0+9A<`mZ)gzPp5Y+||B7%4LF-H0J&xN%dK67Ex;ECaYi8!#>Ko1z(FE~;-ud{`@@PPBFq!{0;a(v?GGJO-WQY?6i!~EW^Yivf)wpWO( za{Ss^sRg^1bY1UqzYpqCY zjI#m840S!TaZMX( z%*Fxs#O&HWa*PgPt{3vbSe%FTNmlFR6wy1UcZWK#XQmz6*JZXsQ`Z_slUoIFw<;UW zr9E4WU(7%p4b?Y?$6^O=sXJBkjNr&0(#lrj^tCElN(t`|;!fMAusBk2{W#x-spP&X zlacqb@QV*x)*lratah>G>^DDkjrmRR?nn)kq{q2!4{hZvwEB$hFs;!k8zjdK*GrDm ze(vD>-2RUB<6dy#>QT3WLX~C8UqR#0lYzUvM1EWGNbl9OJ&1x~D2qL1gHW11>@5^+jrNTdt*w!zED=q!9gmDni~Q4`E>URJ}zmI7%YwrAgtM_S402L)NC{ zG70^J4d9Gm=MD6Qy>NbUbOdi84D-azXoxfF#v(ylXq9KuV8z9hPjMAzhELQ2<@LxC z4q5g1S-V`59&*q(XZIdX=ajVZK1W3K&w?u8?W?x{au&wBQNmyRJy$=^tBdqk=2@po zSbHJiGkpge5{n9d$cA69v#Jrpa0FUcgr=}mTkZCKv}Q0-STcN_>rlqi!droe5HK6t zU6TQ|ujb@qma4mZW2X1)u2HbZyu*7^#D3TkO?QH*ZXN|Ni7`D^Jbc+F@ z>R)-8J?|n>9aqQ7pP5EzLWNk^Yp})N={lT{gvSbw*|4fAc9Bgc2K40C^Q_r43q*mR zS!5GE*Jw&wal8K5$b60i{tzN^?Xj=;We+4e+y8d{TF#z91~BoBSE4AvQY;JCLVx^A z1$()48Cbu#RS#rJET#huA#`aY2<`AjSpvjwXg~rSQe{_R@=@Am*PizyqX~0gxm$$u zM6cHoD14!+@Kt4a@w}b4iW>^^8Ie`(_Yj%&(|p<6vXnn4Qj{EuFN0081Q3jHfnBsLV9=Ue(YHt*R}i%R!lYUe9#YvADDU6nw)p)FxzYjv htp8WHScEiwMUnREWfJr%0s;X*01dx;gGv6K^dE_0Nv8k+ From ba3011721c5b94311caa3018df706f976968dc53 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Sun, 22 Dec 2024 21:39:12 +0800 Subject: [PATCH 08/11] chore: update tests --- tests/test_markitdown.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index b1a0f08..1eefba1 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ 
-54,6 +54,8 @@ XLSX_TEST_STRINGS = [ "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", ] +XLSX_TEST_EXCLUDES = ["Unnamed:", "NaN"] + DOCX_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", @@ -175,11 +177,7 @@ def test_markitdown_local() -> None: # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) - validate_strings(result, XLSX_TEST_STRINGS) - - # Check negations - assert "Unnamed:" not in text_content - assert "NaN" not in text_content + validate_strings(result, XLSX_TEST_STRINGS, XLSX_TEST_EXCLUDES) # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) From 767b8d611dee85b3a55e3d9ff3e204030b42acc8 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Thu, 3 Apr 2025 10:05:33 +0800 Subject: [PATCH 09/11] chore: update to head --- .../markitdown/converters/_xlsx_converter.py | 105 +- packages/markitdown/tests/_test_vectors.py | 4 +- src/markitdown/_markitdown.py | 1549 ----------------- tests/test_markitdown.py | 312 ---- 4 files changed, 72 insertions(+), 1898 deletions(-) delete mode 100644 src/markitdown/_markitdown.py delete mode 100644 tests/test_markitdown.py diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 28f73a0..e6632f6 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -33,15 +33,68 @@ ACCEPTED_XLS_MIME_TYPE_PREFIXES = [ ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"] -class XlsxConverter(DocumentConverter): - """ - Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. - """ +class ExcelConverterBase(DocumentConverter): + """Base class for Excel-like converters""" def __init__(self): super().__init__() self._html_converter = HtmlConverter() + def _clean_colname(self, colname: Any) -> Any: + # Remove Pandas header placeholders + if isinstance(colname, str) and colname.startswith("Unnamed:"): + return None + return colname + + def _convert_excel( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + engine: str, + na_rep: Any = "", + remove_header_placeholders: bool = True, + drop_empty_cols: bool = False, + drop_empty_rows: bool = False, + **kwargs: Any, + ) -> DocumentConverterResult: + sheets = pd.read_excel(file_stream, sheet_name=None, engine=engine) + md_content = "" + for name, sheet in sheets.items(): + md_content += f"## {name}\n" + + if remove_header_placeholders: + sheet = sheet.rename(columns=lambda col: self._clean_colname(col)) + + if drop_empty_cols: + # Also consider headers to be part of the column + sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()] + + if drop_empty_rows: + sheet = sheet.dropna(axis=0, how="all") + + # Coerce any cell that evaluates to `pd.isna(c) == True` to `na_rep` + # More reliable than using `.to_html(na_rep=...)`: https://github.com/pandas-dev/pandas/issues/11953 + # Because the latter does not replace NaT's + with pd.option_context("future.no_silent_downcasting", True): + sheet = sheet.fillna(na_rep, axis=1).infer_objects(copy=False) + sheet.columns = sheet.columns.fillna(na_rep) + + html_content = sheet.to_html(index=False, na_rep=na_rep) + md_content += ( + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + + "\n\n" + ) + + return DocumentConverterResult(markdown=md_content.strip()) + + +class XlsxConverter(ExcelConverterBase): + """ + Converts XLSX files to Markdown, with each 
sheet presented as a separate Markdown table. + """ + def accepts( self, file_stream: BinaryIO, @@ -80,30 +133,19 @@ class XlsxConverter(DocumentConverter): _xlsx_dependency_exc_info[2] ) - sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") - md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) - md_content += ( - self._html_converter.convert_string( - html_content, **kwargs - ).markdown.strip() - + "\n\n" - ) - - return DocumentConverterResult(markdown=md_content.strip()) + return self._convert_excel( + file_stream=file_stream, + stream_info=stream_info, + engine="openpyxl", + **kwargs, + ) -class XlsConverter(DocumentConverter): +class XlsConverter(ExcelConverterBase): """ Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. """ - def __init__(self): - super().__init__() - self._html_converter = HtmlConverter() - def accepts( self, file_stream: BinaryIO, @@ -142,16 +184,9 @@ class XlsConverter(DocumentConverter): _xls_dependency_exc_info[2] ) - sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") - md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) - md_content += ( - self._html_converter.convert_string( - html_content, **kwargs - ).markdown.strip() - + "\n\n" - ) - - return DocumentConverterResult(markdown=md_content.strip()) + return self._convert_excel( + file_stream=file_stream, + stream_info=stream_info, + engine="xlrd", + **kwargs, + ) diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 4a7b54a..e2187a5 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -41,7 +41,7 @@ GENERAL_TEST_VECTORS = [ "6ff4173b-42a5-4784-9b19-f49caff4d93d", "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", ], - must_not_include=[], + must_not_include=["Unnamed:", "NaN"], ), FileTestVector( filename="test.xls", @@ -53,7 +53,7 @@ GENERAL_TEST_VECTORS = [ "6ff4173b-42a5-4784-9b19-f49caff4d93d", "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", ], - must_not_include=[], + must_not_include=["Unnamed:", "NaN"], ), FileTestVector( filename="test.pptx", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py deleted file mode 100644 index 9ca5d67..0000000 --- a/src/markitdown/_markitdown.py +++ /dev/null @@ -1,1549 +0,0 @@ -# type: ignore -import base64 -import binascii -import copy -import html -import json -import mimetypes -import os -import re -import shutil -import subprocess -import sys -import tempfile -import traceback -import zipfile -from xml.dom import minidom -from typing import Any, Dict, List, Optional, Union -from pathlib import Path -from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse -from warnings import warn, resetwarnings, catch_warnings - -import mammoth -import markdownify -import pandas as pd -import pdfminer -import pdfminer.high_level -import pptx - -# File-format detection -import puremagic -import requests -from bs4 import BeautifulSoup -from charset_normalizer import from_path - -# Optional Transcription support -try: - # Using warnings' catch_warnings to catch - # pydub's warning of ffmpeg or avconv missing - with catch_warnings(record=True) as w: - import pydub - - if w: - raise ModuleNotFoundError - import speech_recognition as sr - - IS_AUDIO_TRANSCRIPTION_CAPABLE = True -except ModuleNotFoundError: - pass -finally: - resetwarnings() - -# Optional YouTube transcription 
support -try: - from youtube_transcript_api import YouTubeTranscriptApi - - IS_YOUTUBE_TRANSCRIPT_CAPABLE = True -except ModuleNotFoundError: - pass - - -class _CustomMarkdownify(markdownify.MarkdownConverter): - """ - A custom version of markdownify's MarkdownConverter. Changes include: - - - Altering the default heading style to use '#', '##', etc. - - Removing javascript hyperlinks. - - Truncating images with large data:uri sources. - - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax - """ - - def __init__(self, **options: Any): - options["heading_style"] = options.get("heading_style", markdownify.ATX) - # Explicitly cast options to the expected type if necessary - super().__init__(**options) - - def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: - """Same as usual, but be sure to start with a new line""" - if not convert_as_inline: - if not re.search(r"^\n", text): - return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore - - return super().convert_hn(n, el, text, convert_as_inline) # type: ignore - - def convert_a(self, el: Any, text: str, convert_as_inline: bool): - """Same as usual converter, but removes Javascript links and escapes URIs.""" - prefix, suffix, text = markdownify.chomp(text) # type: ignore - if not text: - return "" - href = el.get("href") - title = el.get("title") - - # Escape URIs and skip non-http or file schemes - if href: - try: - parsed_url = urlparse(href) # type: ignore - if parsed_url.scheme and parsed_url.scheme.lower() not in [ - "http", - "https", - "file", - ]: # type: ignore - return "%s%s%s" % (prefix, text, suffix) - href = urlunparse( - parsed_url._replace(path=quote(unquote(parsed_url.path))) - ) # type: ignore - except ValueError: # It's not clear if this ever gets thrown - return "%s%s%s" % (prefix, text, suffix) - - # For the replacement see #29: text nodes underscores are escaped - if ( - self.options["autolinks"] - and text.replace(r"\_", "_") == href - and not title - and not self.options["default_title"] - ): - # Shortcut syntax - return "<%s>" % href - if self.options["default_title"] and not title: - title = href - title_part = ' "%s"' % title.replace('"', r"\"") if title else "" - return ( - "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) - if href - else text - ) - - def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: - """Same as usual converter, but removes data URIs""" - - alt = el.attrs.get("alt", None) or "" - src = el.attrs.get("src", None) or "" - title = el.attrs.get("title", None) or "" - title_part = ' "%s"' % title.replace('"', r"\"") if title else "" - if ( - convert_as_inline - and el.parent.name not in self.options["keep_inline_images_in"] - ): - return alt - - # Remove dataURIs - if src.startswith("data:"): - src = src.split(",")[0] + "..." 
- - return "![%s](%s%s)" % (alt, src, title_part) - - def convert_soup(self, soup: Any) -> str: - return super().convert_soup(soup) # type: ignore - - -class DocumentConverterResult: - """The result of converting a document to text.""" - - def __init__(self, title: Union[str, None] = None, text_content: str = ""): - self.title: Union[str, None] = title - self.text_content: str = text_content - - -class DocumentConverter: - """Abstract superclass of all DocumentConverters.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - raise NotImplementedError() - - -class PlainTextConverter(DocumentConverter): - """Anything with content type text/plain""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Guess the content type from any file extension that might be around - content_type, _ = mimetypes.guess_type( - "__placeholder" + kwargs.get("file_extension", "") - ) - - # Only accept text files - if content_type is None: - return None - elif "text/" not in content_type.lower(): - return None - - text_content = str(from_path(local_path).best()) - return DocumentConverterResult( - title=None, - text_content=text_content, - ) - - -class HtmlConverter(DocumentConverter): - """Anything with content type text/html""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not html - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - - result = None - with open(local_path, "rt", encoding="utf-8") as fh: - result = self._convert(fh.read()) - - return result - - def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: - """Helper function that converts and HTML string.""" - - # Parse the string - soup = BeautifulSoup(html_content, "html.parser") - - # Remove javascript and style blocks - for script in soup(["script", "style"]): - script.extract() - - # Print only the main content - body_elm = soup.find("body") - webpage_text = "" - if body_elm: - webpage_text = _CustomMarkdownify().convert_soup(body_elm) - else: - webpage_text = _CustomMarkdownify().convert_soup(soup) - - assert isinstance(webpage_text, str) - - return DocumentConverterResult( - title=None if soup.title is None else soup.title.string, - text_content=webpage_text, - ) - - -class RSSConverter(DocumentConverter): - """Convert RSS / Atom type to markdown""" - - def convert( - self, local_path: str, **kwargs - ) -> Union[None, DocumentConverterResult]: - # Bail if not RSS type - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".xml", ".rss", ".atom"]: - return None - try: - doc = minidom.parse(local_path) - except BaseException as _: - return None - result = None - if doc.getElementsByTagName("rss"): - # A RSS feed must have a root element of - result = self._parse_rss_type(doc) - elif doc.getElementsByTagName("feed"): - root = doc.getElementsByTagName("feed")[0] - if root.getElementsByTagName("entry"): - # An Atom feed must have a root element of and at least one - result = self._parse_atom_type(doc) - else: - return None - else: - # not rss or atom - return None - - return result - - def _parse_atom_type( - self, doc: minidom.Document - ) -> Union[None, DocumentConverterResult]: - """Parse the type of an Atom feed. - - Returns None if the feed type is not recognized or something goes wrong. 
- """ - try: - root = doc.getElementsByTagName("feed")[0] - title = self._get_data_by_tag_name(root, "title") - subtitle = self._get_data_by_tag_name(root, "subtitle") - entries = root.getElementsByTagName("entry") - md_text = f"# {title}\n" - if subtitle: - md_text += f"{subtitle}\n" - for entry in entries: - entry_title = self._get_data_by_tag_name(entry, "title") - entry_summary = self._get_data_by_tag_name(entry, "summary") - entry_updated = self._get_data_by_tag_name(entry, "updated") - entry_content = self._get_data_by_tag_name(entry, "content") - - if entry_title: - md_text += f"\n## {entry_title}\n" - if entry_updated: - md_text += f"Updated on: {entry_updated}\n" - if entry_summary: - md_text += self._parse_content(entry_summary) - if entry_content: - md_text += self._parse_content(entry_content) - - return DocumentConverterResult( - title=title, - text_content=md_text, - ) - except BaseException as _: - return None - - def _parse_rss_type( - self, doc: minidom.Document - ) -> Union[None, DocumentConverterResult]: - """Parse the type of an RSS feed. - - Returns None if the feed type is not recognized or something goes wrong. - """ - try: - root = doc.getElementsByTagName("rss")[0] - channel = root.getElementsByTagName("channel") - if not channel: - return None - channel = channel[0] - channel_title = self._get_data_by_tag_name(channel, "title") - channel_description = self._get_data_by_tag_name(channel, "description") - items = channel.getElementsByTagName("item") - if channel_title: - md_text = f"# {channel_title}\n" - if channel_description: - md_text += f"{channel_description}\n" - if not items: - items = [] - for item in items: - title = self._get_data_by_tag_name(item, "title") - description = self._get_data_by_tag_name(item, "description") - pubDate = self._get_data_by_tag_name(item, "pubDate") - content = self._get_data_by_tag_name(item, "content:encoded") - - if title: - md_text += f"\n## {title}\n" - if pubDate: - md_text += f"Published on: {pubDate}\n" - if description: - md_text += self._parse_content(description) - if content: - md_text += self._parse_content(content) - - return DocumentConverterResult( - title=channel_title, - text_content=md_text, - ) - except BaseException as _: - print(traceback.format_exc()) - return None - - def _parse_content(self, content: str) -> str: - """Parse the content of an RSS feed item""" - try: - # using bs4 because many RSS feeds have HTML-styled content - soup = BeautifulSoup(content, "html.parser") - return _CustomMarkdownify().convert_soup(soup) - except BaseException as _: - return content - - def _get_data_by_tag_name( - self, element: minidom.Element, tag_name: str - ) -> Union[str, None]: - """Get data from first child element with the given tag name. - Returns None when no such element is found. 
- """ - nodes = element.getElementsByTagName(tag_name) - if not nodes: - return None - fc = nodes[0].firstChild - if fc: - return fc.data - return None - - -class WikipediaConverter(DocumentConverter): - """Handle Wikipedia pages separately, focusing only on the main document content.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not Wikipedia - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): - return None - - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - - # Remove javascript and style blocks - for script in soup(["script", "style"]): - script.extract() - - # Print only the main content - body_elm = soup.find("div", {"id": "mw-content-text"}) - title_elm = soup.find("span", {"class": "mw-page-title-main"}) - - webpage_text = "" - main_title = None if soup.title is None else soup.title.string - - if body_elm: - # What's the title - if title_elm and len(title_elm) > 0: - main_title = title_elm.string # type: ignore - assert isinstance(main_title, str) - - # Convert the page - webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup( - body_elm - ) - else: - webpage_text = _CustomMarkdownify().convert_soup(soup) - - return DocumentConverterResult( - title=main_title, - text_content=webpage_text, - ) - - -class YouTubeConverter(DocumentConverter): - """Handle YouTube specially, focusing on the video title, description, and transcript.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not YouTube - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not url.startswith("https://www.youtube.com/watch?"): - return None - - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - - # Read the meta tags - assert soup.title is not None and soup.title.string is not None - metadata: Dict[str, str] = {"title": soup.title.string} - for meta in soup(["meta"]): - for a in meta.attrs: - if a in ["itemprop", "property", "name"]: - metadata[meta[a]] = meta.get("content", "") - break - - # We can also try to read the full description. 
This is more prone to breaking, since it reaches into the page implementation - try: - for script in soup(["script"]): - content = script.text - if "ytInitialData" in content: - lines = re.split(r"\r?\n", content) - obj_start = lines[0].find("{") - obj_end = lines[0].rfind("}") - if obj_start >= 0 and obj_end >= 0: - data = json.loads(lines[0][obj_start : obj_end + 1]) - attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore - if attrdesc: - metadata["description"] = str(attrdesc["content"]) - break - except Exception: - pass - - # Start preparing the page - webpage_text = "# YouTube\n" - - title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore - assert isinstance(title, str) - - if title: - webpage_text += f"\n## {title}\n" - - stats = "" - views = self._get(metadata, ["interactionCount"]) # type: ignore - if views: - stats += f"- **Views:** {views}\n" - - keywords = self._get(metadata, ["keywords"]) # type: ignore - if keywords: - stats += f"- **Keywords:** {keywords}\n" - - runtime = self._get(metadata, ["duration"]) # type: ignore - if runtime: - stats += f"- **Runtime:** {runtime}\n" - - if len(stats) > 0: - webpage_text += f"\n### Video Metadata\n{stats}\n" - - description = self._get(metadata, ["description", "og:description"]) # type: ignore - if description: - webpage_text += f"\n### Description\n{description}\n" - - if IS_YOUTUBE_TRANSCRIPT_CAPABLE: - transcript_text = "" - parsed_url = urlparse(url) # type: ignore - params = parse_qs(parsed_url.query) # type: ignore - if "v" in params: - assert isinstance(params["v"][0], str) - video_id = str(params["v"][0]) - try: - youtube_transcript_languages = kwargs.get( - "youtube_transcript_languages", ("en",) - ) - # Must be a single transcript. - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore - transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore - # Alternative formatting: - # formatter = TextFormatter() - # formatter.format_transcript(transcript) - except Exception: - pass - if transcript_text: - webpage_text += f"\n### Transcript\n{transcript_text}\n" - - title = title if title else soup.title.string - assert isinstance(title, str) - - return DocumentConverterResult( - title=title, - text_content=webpage_text, - ) - - def _get( - self, - metadata: Dict[str, str], - keys: List[str], - default: Union[str, None] = None, - ) -> Union[str, None]: - for k in keys: - if k in metadata: - return metadata[k] - return default - - def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type - if isinstance(json, list): - for elm in json: - ret = self._findKey(elm, key) - if ret is not None: - return ret - elif isinstance(json, dict): - for k in json: - if k == key: - return json[k] - else: - ret = self._findKey(json[k], key) - if ret is not None: - return ret - return None - - -class IpynbConverter(DocumentConverter): - """Converts Jupyter Notebook (.ipynb) files to Markdown.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not ipynb - extension = kwargs.get("file_extension", "") - if extension.lower() != ".ipynb": - return None - - # Parse and convert the notebook - result = None - with open(local_path, "rt", encoding="utf-8") as fh: - notebook_content = json.load(fh) - result = self._convert(notebook_content) - - return result - - def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]: - 
"""Helper function that converts notebook JSON content to Markdown.""" - try: - md_output = [] - title = None - - for cell in notebook_content.get("cells", []): - cell_type = cell.get("cell_type", "") - source_lines = cell.get("source", []) - - if cell_type == "markdown": - md_output.append("".join(source_lines)) - - # Extract the first # heading as title if not already found - if title is None: - for line in source_lines: - if line.startswith("# "): - title = line.lstrip("# ").strip() - break - - elif cell_type == "code": - # Code cells are wrapped in Markdown code blocks - md_output.append(f"```python\n{''.join(source_lines)}\n```") - elif cell_type == "raw": - md_output.append(f"```\n{''.join(source_lines)}\n```") - - md_text = "\n\n".join(md_output) - - # Check for title in notebook metadata - title = notebook_content.get("metadata", {}).get("title", title) - - return DocumentConverterResult( - title=title, - text_content=md_text, - ) - - except Exception as e: - raise FileConversionException( - f"Error converting .ipynb file: {str(e)}" - ) from e - - -class BingSerpConverter(DocumentConverter): - """ - Handle Bing results pages (only the organic search results). - NOTE: It is better to use the Bing API - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a Bing SERP - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not re.search(r"^https://www\.bing\.com/search\?q=", url): - return None - - # Parse the query parameters - parsed_params = parse_qs(urlparse(url).query) - query = parsed_params.get("q", [""])[0] - - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - - # Clean up some formatting - for tptt in soup.find_all(class_="tptt"): - if hasattr(tptt, "string") and tptt.string: - tptt.string += " " - for slug in soup.find_all(class_="algoSlug_icon"): - slug.extract() - - # Parse the algorithmic results - _markdownify = _CustomMarkdownify() - results = list() - for result in soup.find_all(class_="b_algo"): - # Rewrite redirect urls - for a in result.find_all("a", href=True): - parsed_href = urlparse(a["href"]) - qs = parse_qs(parsed_href.query) - - # The destination is contained in the u parameter, - # but appears to be base64 encoded, with some prefix - if "u" in qs: - u = ( - qs["u"][0][2:].strip() + "==" - ) # Python 3 doesn't care about extra padding - - try: - # RFC 4648 / Base64URL" variant, which uses "-" and "_" - a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8") - except UnicodeDecodeError: - pass - except binascii.Error: - pass - - # Convert to markdown - md_result = _markdownify.convert_soup(result).strip() - lines = [line.strip() for line in re.split(r"\n+", md_result)] - results.append("\n".join([line for line in lines if len(line) > 0])) - - webpage_text = ( - f"## A Bing search for '{query}' found the following results:\n\n" - + "\n\n".join(results) - ) - - return DocumentConverterResult( - title=None if soup.title is None else soup.title.string, - text_content=webpage_text, - ) - - -class PdfConverter(DocumentConverter): - """ - Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. 
- """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PDF - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pdf": - return None - - return DocumentConverterResult( - title=None, - text_content=pdfminer.high_level.extract_text(local_path), - ) - - -class DocxConverter(HtmlConverter): - """ - Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a DOCX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".docx": - return None - - result = None - with open(local_path, "rb") as docx_file: - style_map = kwargs.get("style_map", None) - - result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = result.value - result = self._convert(html_content) - - return result - - -class XlsxConverter(HtmlConverter): - """ - Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. - """ - - def _clean_colname(self, colname: Any) -> Any: - # Remove Pandas header placeholders - if isinstance(colname, str) and colname.startswith("Unnamed:"): - return None - return colname - - def convert( - self, - local_path, - na_rep: Any = "", - drop_empty_cols: bool = False, - drop_empty_rows: bool = False, - **kwargs, - ) -> Union[None, DocumentConverterResult]: - # Bail if not a XLSX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xlsx": - return None - - sheets = pd.read_excel(local_path, sheet_name=None) - md_content = "" - for name, sheet in sheets.items(): - md_content += f"## {name}\n" - sheet = sheet.rename(columns=lambda col: self._clean_colname(col)) - - if drop_empty_cols: - # also consider headers to be part of the column - sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()] - - if drop_empty_rows: - sheet = sheet.dropna(axis=0, how="all") - - # convert remaining NaN's to empty string - # because .to_html(na_rep="") does not apply to headers - sheet.columns = sheet.columns.fillna(na_rep) - - html_content = sheet.to_html(index=False, na_rep=na_rep) - md_content += self._convert(html_content).text_content.strip() + "\n\n" - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - -class PptxConverter(HtmlConverter): - """ - Converts PPTX files to Markdown. Supports heading, tables and images with alt text. 
- """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PPTX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pptx": - return None - - md_content = "" - - presentation = pptx.Presentation(local_path) - slide_num = 0 - for slide in presentation.slides: - slide_num += 1 - - md_content += f"\n\n\n" - - title = slide.shapes.title - for shape in slide.shapes: - # Pictures - if self._is_picture(shape): - # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 - alt_text = "" - try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") - except Exception: - pass - - # A placeholder name - filename = re.sub(r"\W", "", shape.name) + ".jpg" - md_content += ( - "\n![" - + (alt_text if alt_text else shape.name) - + "](" - + filename - + ")\n" - ) - - # Tables - if self._is_table(shape): - html_table = "" - first_row = True - for row in shape.table.rows: - html_table += "" - for cell in row.cells: - if first_row: - html_table += "" - else: - html_table += "" - html_table += "" - first_row = False - html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" - md_content += ( - "\n" + self._convert(html_table).text_content.strip() + "\n" - ) - - # Charts - if shape.has_chart: - md_content += self._convert_chart_to_markdown(shape.chart) - - # Text areas - elif shape.has_text_frame: - if shape == title: - md_content += "# " + shape.text.lstrip() + "\n" - else: - md_content += shape.text + "\n" - - md_content = md_content.strip() - - if slide.has_notes_slide: - md_content += "\n\n### Notes:\n" - notes_frame = slide.notes_slide.notes_text_frame - if notes_frame is not None: - md_content += notes_frame.text - md_content = md_content.strip() - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - def _is_picture(self, shape): - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: - return True - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: - if hasattr(shape, "image"): - return True - return False - - def _is_table(self, shape): - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: - return True - return False - - def _convert_chart_to_markdown(self, chart): - md = "\n\n### Chart" - if chart.has_title: - md += f": {chart.chart_title.text_frame.text}" - md += "\n\n" - data = [] - category_names = [c.label for c in chart.plots[0].categories] - series_names = [s.name for s in chart.series] - data.append(["Category"] + series_names) - - for idx, category in enumerate(category_names): - row = [category] - for series in chart.series: - row.append(series.values[idx]) - data.append(row) - - markdown_table = [] - for row in data: - markdown_table.append("| " + " | ".join(map(str, row)) + " |") - header = markdown_table[0] - separator = "|" + "|".join(["---"] * len(data[0])) + "|" - return md + "\n".join([header, separator] + markdown_table[1:]) - - -class MediaConverter(DocumentConverter): - """ - Abstract class for multi-modal media (e.g., images and audio) - """ - - def _get_metadata(self, local_path): - exiftool = shutil.which("exiftool") - if not exiftool: - return None - else: - try: - result = subprocess.run( - [exiftool, "-json", local_path], - capture_output=True, - text=True, - ).stdout - return json.loads(result)[0] - except Exception: - return None - - -class WavConverter(MediaConverter): - """ - Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a WAV - extension = kwargs.get("file_extension", "") - if extension.lower() != ".wav": - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path) - if metadata: - for f in [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - "Duration", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Transcribe - if IS_AUDIO_TRANSCRIPTION_CAPABLE: - try: - transcript = self._transcribe_audio(local_path) - md_content += "\n\n### Audio Transcript:\n" + ( - "[No speech detected]" if transcript == "" else transcript - ) - except Exception: - md_content += ( - "\n\n### Audio Transcript:\nError. Could not transcribe this audio." 
- ) - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - def _transcribe_audio(self, local_path) -> str: - recognizer = sr.Recognizer() - with sr.AudioFile(local_path) as source: - audio = recognizer.record(source) - return recognizer.recognize_google(audio).strip() - - -class Mp3Converter(WavConverter): - """ - Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a MP3 - extension = kwargs.get("file_extension", "") - if extension.lower() != ".mp3": - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path) - if metadata: - for f in [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - "Duration", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Transcribe - if IS_AUDIO_TRANSCRIPTION_CAPABLE: - handle, temp_path = tempfile.mkstemp(suffix=".wav") - os.close(handle) - try: - sound = pydub.AudioSegment.from_mp3(local_path) - sound.export(temp_path, format="wav") - - _args = dict() - _args.update(kwargs) - _args["file_extension"] = ".wav" - - try: - transcript = super()._transcribe_audio(temp_path).strip() - md_content += "\n\n### Audio Transcript:\n" + ( - "[No speech detected]" if transcript == "" else transcript - ) - except Exception: - md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." - - finally: - os.unlink(temp_path) - - # Return the result - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - -class ImageConverter(MediaConverter): - """ - Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not an image - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".jpg", ".jpeg", ".png"]: - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path) - if metadata: - for f in [ - "ImageSize", - "Title", - "Caption", - "Description", - "Keywords", - "Artist", - "Author", - "DateTimeOriginal", - "CreateDate", - "GPSPosition", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Try describing the image with GPTV - llm_client = kwargs.get("llm_client") - llm_model = kwargs.get("llm_model") - if llm_client is not None and llm_model is not None: - md_content += ( - "\n# Description:\n" - + self._get_llm_description( - local_path, - extension, - llm_client, - llm_model, - prompt=kwargs.get("llm_prompt"), - ).strip() - + "\n" - ) - - return DocumentConverterResult( - title=None, - text_content=md_content, - ) - - def _get_llm_description(self, local_path, extension, client, model, prompt=None): - if prompt is None or prompt.strip() == "": - prompt = "Write a detailed caption for this image." 
- - data_uri = "" - with open(local_path, "rb") as image_file: - content_type, encoding = mimetypes.guess_type("_dummy" + extension) - if content_type is None: - content_type = "image/jpeg" - image_base64 = base64.b64encode(image_file.read()).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" - - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": prompt}, - { - "type": "image_url", - "image_url": { - "url": data_uri, - }, - }, - ], - } - ] - - response = client.chat.completions.create(model=model, messages=messages) - return response.choices[0].message.content - - -class ZipConverter(DocumentConverter): - """Converts ZIP files to markdown by extracting and converting all contained files. - - The converter extracts the ZIP contents to a temporary directory, processes each file - using appropriate converters based on file extensions, and then combines the results - into a single markdown document. The temporary directory is cleaned up after processing. - - Example output format: - ```markdown - Content from the zip file `example.zip`: - - ## File: docs/readme.txt - - This is the content of readme.txt - Multiple lines are preserved - - ## File: images/example.jpg - - ImageSize: 1920x1080 - DateTimeOriginal: 2024-02-15 14:30:00 - Description: A beautiful landscape photo - - ## File: data/report.xlsx - - ## Sheet1 - | Column1 | Column2 | Column3 | - |---------|---------|---------| - | data1 | data2 | data3 | - | data4 | data5 | data6 | - ``` - - Key features: - - Maintains original file structure in headings - - Processes nested files recursively - - Uses appropriate converters for each file type - - Preserves formatting of converted content - - Cleans up temporary files after processing - """ - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not a ZIP - extension = kwargs.get("file_extension", "") - if extension.lower() != ".zip": - return None - - # Get parent converters list if available - parent_converters = kwargs.get("_parent_converters", []) - if not parent_converters: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] No converters available to process zip contents from: {local_path}", - ) - - extracted_zip_folder_name = ( - f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" - ) - extraction_dir = os.path.normpath( - os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) - ) - md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" - - try: - # Extract the zip file safely - with zipfile.ZipFile(local_path, "r") as zipObj: - # Safeguard against path traversal - for member in zipObj.namelist(): - member_path = os.path.normpath(os.path.join(extraction_dir, member)) - if ( - not os.path.commonprefix([extraction_dir, member_path]) - == extraction_dir - ): - raise ValueError( - f"Path traversal detected in zip file: {member}" - ) - - # Extract all files safely - zipObj.extractall(path=extraction_dir) - - # Process each extracted file - for root, dirs, files in os.walk(extraction_dir): - for name in files: - file_path = os.path.join(root, name) - relative_path = os.path.relpath(file_path, extraction_dir) - - # Get file extension - _, file_extension = os.path.splitext(name) - - # Update kwargs for the file - file_kwargs = kwargs.copy() - file_kwargs["file_extension"] = file_extension - file_kwargs["_parent_converters"] = parent_converters - - # Try converting the file using available converters - for 
converter in parent_converters: - # Skip the zip converter to avoid infinite recursion - if isinstance(converter, ZipConverter): - continue - - result = converter.convert(file_path, **file_kwargs) - if result is not None: - md_content += f"\n## File: {relative_path}\n\n" - md_content += result.text_content + "\n\n" - break - - # Clean up extracted files if specified - if kwargs.get("cleanup_extracted", True): - shutil.rmtree(extraction_dir) - - return DocumentConverterResult(title=None, text_content=md_content.strip()) - - except zipfile.BadZipFile: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}", - ) - except ValueError as ve: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", - ) - except Exception as e: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", - ) - - -class FileConversionException(BaseException): - pass - - -class UnsupportedFormatException(BaseException): - pass - - -class MarkItDown: - """(In preview) An extremely simple text-based document reader, suitable for LLM use. - This reader will convert common file-types or webpages to Markdown.""" - - def __init__( - self, - requests_session: Optional[requests.Session] = None, - llm_client: Optional[Any] = None, - llm_model: Optional[str] = None, - style_map: Optional[str] = None, - # Deprecated - mlm_client: Optional[Any] = None, - mlm_model: Optional[str] = None, - ): - if requests_session is None: - self._requests_session = requests.Session() - else: - self._requests_session = requests_session - - # Handle deprecation notices - ############################# - if mlm_client is not None: - if llm_client is None: - warn( - "'mlm_client' is deprecated, and was renamed 'llm_client'.", - DeprecationWarning, - ) - llm_client = mlm_client - mlm_client = None - else: - raise ValueError( - "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead." - ) - - if mlm_model is not None: - if llm_model is None: - warn( - "'mlm_model' is deprecated, and was renamed 'llm_model'.", - DeprecationWarning, - ) - llm_model = mlm_model - mlm_model = None - else: - raise ValueError( - "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead." 
-                )
-        #############################
-
-        self._llm_client = llm_client
-        self._llm_model = llm_model
-        self._style_map = style_map
-
-        self._page_converters: List[DocumentConverter] = []
-
-        # Register converters for successful browsing operations
-        # Later registrations are tried first / take higher priority than earlier registrations
-        # To this end, the most specific converters should appear below the most generic converters
-        self.register_page_converter(PlainTextConverter())
-        self.register_page_converter(HtmlConverter())
-        self.register_page_converter(RSSConverter())
-        self.register_page_converter(WikipediaConverter())
-        self.register_page_converter(YouTubeConverter())
-        self.register_page_converter(BingSerpConverter())
-        self.register_page_converter(DocxConverter())
-        self.register_page_converter(XlsxConverter())
-        self.register_page_converter(PptxConverter())
-        self.register_page_converter(WavConverter())
-        self.register_page_converter(Mp3Converter())
-        self.register_page_converter(ImageConverter())
-        self.register_page_converter(IpynbConverter())
-        self.register_page_converter(PdfConverter())
-        self.register_page_converter(ZipConverter())
-
-    def convert(
-        self, source: Union[str, requests.Response, Path], **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
-        """
-        Args:
-            - source: a path (as a string or a pathlib.Path object), a URL string, or a requests.Response object
-            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
-        """
-
-        # Local path or url
-        if isinstance(source, str):
-            if (
-                source.startswith("http://")
-                or source.startswith("https://")
-                or source.startswith("file://")
-            ):
-                return self.convert_url(source, **kwargs)
-            else:
-                return self.convert_local(source, **kwargs)
-        # Request response
-        elif isinstance(source, requests.Response):
-            return self.convert_response(source, **kwargs)
-        elif isinstance(source, Path):
-            return self.convert_local(source, **kwargs)
-
-    def convert_local(
-        self, path: Union[str, Path], **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
-        if isinstance(path, Path):
-            path = str(path)
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
-
-        # Get extension alternatives from the path and puremagic
-        base, ext = os.path.splitext(path)
-        self._append_ext(extensions, ext)
-
-        for g in self._guess_ext_magic(path):
-            self._append_ext(extensions, g)
-
-        # Convert
-        return self._convert(path, extensions, **kwargs)
-
-    # TODO what should stream's type be?
-    def convert_stream(
-        self, stream: Any, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
-
-        # Save the file locally to a temporary file. It will be deleted before this method exits
-        handle, temp_path = tempfile.mkstemp()
-        fh = os.fdopen(handle, "wb")
-        result = None
-        try:
-            # Write to the temporary file
-            content = stream.read()
-            if isinstance(content, str):
-                fh.write(content.encode("utf-8"))
-            else:
-                fh.write(content)
-            fh.close()
-
-            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
-                self._append_ext(extensions, g)
-
-            # Convert
-            result = self._convert(temp_path, extensions, **kwargs)
-        # Clean up
-        finally:
-            try:
-                fh.close()
-            except Exception:
-                pass
-            os.unlink(temp_path)
-
-        return result
-
-    def convert_url(
-        self, url: str, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: fix kwargs type
-        # Send an HTTP request to the URL
-        response = self._requests_session.get(url, stream=True)
-        response.raise_for_status()
-        return self.convert_response(response, **kwargs)
-
-    def convert_response(
-        self, response: requests.Response, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO fix kwargs type
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
-
-        # Guess from the mimetype
-        content_type = response.headers.get("content-type", "").split(";")[0]
-        self._append_ext(extensions, mimetypes.guess_extension(content_type))
-
-        # Read the content disposition if there is one
-        content_disposition = response.headers.get("content-disposition", "")
-        m = re.search(r"filename=([^;]+)", content_disposition)
-        if m:
-            base, ext = os.path.splitext(m.group(1).strip("\"'"))
-            self._append_ext(extensions, ext)
-
-        # Read the extension from the path
-        base, ext = os.path.splitext(urlparse(response.url).path)
-        self._append_ext(extensions, ext)
-
-        # Save the file locally to a temporary file.
It will be deleted before this method exits - handle, temp_path = tempfile.mkstemp() - fh = os.fdopen(handle, "wb") - result = None - try: - # Download the file - for chunk in response.iter_content(chunk_size=512): - fh.write(chunk) - fh.close() - - # Use puremagic to check for more extension options - for g in self._guess_ext_magic(temp_path): - self._append_ext(extensions, g) - - # Convert - result = self._convert(temp_path, extensions, url=response.url, **kwargs) - # Clean up - finally: - try: - fh.close() - except Exception: - pass - os.unlink(temp_path) - - return result - - def _convert( - self, local_path: str, extensions: List[Union[str, None]], **kwargs - ) -> DocumentConverterResult: - error_trace = "" - for ext in extensions + [None]: # Try last with no extension - for converter in self._page_converters: - _kwargs = copy.deepcopy(kwargs) - - # Overwrite file_extension appropriately - if ext is None: - if "file_extension" in _kwargs: - del _kwargs["file_extension"] - else: - _kwargs.update({"file_extension": ext}) - - # Copy any additional global options - if "llm_client" not in _kwargs and self._llm_client is not None: - _kwargs["llm_client"] = self._llm_client - - if "llm_model" not in _kwargs and self._llm_model is not None: - _kwargs["llm_model"] = self._llm_model - - # Add the list of converters for nested processing - _kwargs["_parent_converters"] = self._page_converters - - if "style_map" not in _kwargs and self._style_map is not None: - _kwargs["style_map"] = self._style_map - - # If we hit an error log it and keep trying - try: - res = converter.convert(local_path, **_kwargs) - except Exception: - error_trace = ("\n\n" + traceback.format_exc()).strip() - - if res is not None: - # Normalize the content - res.text_content = "\n".join( - [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] - ) - res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) - - # Todo - return res - - # If we got this far without success, report any exceptions - if len(error_trace) > 0: - raise FileConversionException( - f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}" - ) - - # Nothing can handle it! - raise UnsupportedFormatException( - f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported." - ) - - def _append_ext(self, extensions, ext): - """Append a unique non-None, non-empty extension to a list of extensions.""" - if ext is None: - return - ext = ext.strip() - if ext == "": - return - # if ext not in extensions: - extensions.append(ext) - - def _guess_ext_magic(self, path): - """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" - # Use puremagic to guess - try: - guesses = puremagic.magic_file(path) - extensions = list() - for g in guesses: - ext = g.extension.strip() - if len(ext) > 0: - if not ext.startswith("."): - ext = "." 
+ ext - if ext not in extensions: - extensions.append(ext) - return extensions - except FileNotFoundError: - pass - except IsADirectoryError: - pass - except PermissionError: - pass - return [] - - def register_page_converter(self, converter: DocumentConverter) -> None: - """Register a page text converter.""" - self._page_converters.insert(0, converter) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py deleted file mode 100644 index 1eefba1..0000000 --- a/tests/test_markitdown.py +++ /dev/null @@ -1,312 +0,0 @@ -#!/usr/bin/env python3 -m pytest -import io -import os -import shutil - -import pytest -import requests - -from warnings import catch_warnings, resetwarnings - -from markitdown import MarkItDown - -skip_remote = ( - True if os.environ.get("GITHUB_ACTIONS") else False -) # Don't run these tests in CI - - -# Don't run the llm tests without a key and the client library -skip_llm = False if os.environ.get("OPENAI_API_KEY") else True -try: - import openai -except ModuleNotFoundError: - skip_llm = True - -# Skip exiftool tests if not installed -skip_exiftool = shutil.which("exiftool") is None - -TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") - -JPG_TEST_EXIFTOOL = { - "Author": "AutoGen Authors", - "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - "Description": "AutoGen enables diverse LLM-based applications", - "ImageSize": "1615x1967", - "DateTimeOriginal": "2024:03:14 22:10:00", -} - -PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf" -PDF_TEST_STRINGS = [ - "While there is contemporaneous exploration of multi-agent approaches" -] - -YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg" -YOUTUBE_TEST_STRINGS = [ - "## AutoGen FULL Tutorial with Python (Step-By-Step)", - "This is an intermediate tutorial for installing and using AutoGen locally", - "PT15M4S", - "the model we're going to be using today is GPT 3.5 turbo", # From the transcript -] - -XLSX_TEST_STRINGS = [ - "## 09060124-b5e7-4717-9d07-3c046eb", - "6ff4173b-42a5-4784-9b19-f49caff4d93d", - "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", -] - -XLSX_TEST_EXCLUDES = ["Unnamed:", "NaN"] - - -DOCX_TEST_STRINGS = [ - "314b0a30-5b04-470b-b9f7-eed2c2bec74a", - "49e168b7-d2ae-407f-a055-2167576f39a1", - "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", - "# Abstract", - "# Introduction", - "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", -] - -DOCX_COMMENT_TEST_STRINGS = [ - "314b0a30-5b04-470b-b9f7-eed2c2bec74a", - "49e168b7-d2ae-407f-a055-2167576f39a1", - "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", - "# Abstract", - "# Introduction", - "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - "This is a test comment. 12df-321a", - "Yet another comment in the doc. 55yiyi-asd09", -] - -PPTX_TEST_STRINGS = [ - "2cdda5c8-e50e-4db4-b5f0-9722a649f455", - "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", - "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", - "1b92870d-e3b5-4e65-8153-919f4ff45592", - "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title - "2003", # chart value -] - -BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" -BLOG_TEST_STRINGS = [ - "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? 
What about inference parameters?", - "an example where high cost can easily prevent a generic complex", -] - - -RSS_TEST_STRINGS = [ - "The Official Microsoft Blog", - "In the case of AI, it is absolutely true that the industry is moving incredibly fast", -] - - -WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft" -WIKIPEDIA_TEST_STRINGS = [ - "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", - 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")', -] -WIKIPEDIA_TEST_EXCLUDES = [ - "You are encouraged to create an account and log in", - "154 languages", - "move to sidebar", -] - -SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia" -SERP_TEST_STRINGS = [ - "](https://en.wikipedia.org/wiki/Microsoft", - "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond", - "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox", -] -SERP_TEST_EXCLUDES = [ - "https://www.bing.com/ck/a?!&&p=", - "data:image/svg+xml,%3Csvg%20width%3D", -] - -CSV_CP932_TEST_STRINGS = [ - "名前,年齢,住所", - "佐藤太郎,30,東京", - "三木英子,25,大阪", - "髙橋淳,35,名古屋", -] - -LLM_TEST_STRINGS = [ - "5bda1dd6", -] - - -# --- Helper Functions --- -def validate_strings(result, expected_strings, exclude_strings=None): - """Validate presence or absence of specific strings.""" - text_content = result.text_content.replace("\\", "") - for string in expected_strings: - assert string in text_content - if exclude_strings: - for string in exclude_strings: - assert string not in text_content - - -@pytest.mark.skipif( - skip_remote, - reason="do not run tests that query external urls", -) -def test_markitdown_remote() -> None: - markitdown = MarkItDown() - - # By URL - result = markitdown.convert(PDF_TEST_URL) - for test_string in PDF_TEST_STRINGS: - assert test_string in result.text_content - - # By stream - response = requests.get(PDF_TEST_URL) - result = markitdown.convert_stream( - io.BytesIO(response.content), file_extension=".pdf", url=PDF_TEST_URL - ) - for test_string in PDF_TEST_STRINGS: - assert test_string in result.text_content - - # Youtube - # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. 
Disabling until I can debug the issue - # result = markitdown.convert(YOUTUBE_TEST_URL) - # for test_string in YOUTUBE_TEST_STRINGS: - # assert test_string in result.text_content - - -def test_markitdown_local() -> None: - markitdown = MarkItDown() - - # Test XLSX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) - validate_strings(result, XLSX_TEST_STRINGS, XLSX_TEST_EXCLUDES) - - # Test DOCX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) - validate_strings(result, DOCX_TEST_STRINGS) - - # Test DOCX processing, with comments - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), - style_map="comment-reference => ", - ) - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - - # Test DOCX processing, with comments and setting style_map on init - markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") - result = markitdown_with_style_map.convert( - os.path.join(TEST_FILES_DIR, "test_with_comment.docx") - ) - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - - # Test PPTX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) - validate_strings(result, PPTX_TEST_STRINGS) - - # Test HTML processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL - ) - validate_strings(result, BLOG_TEST_STRINGS) - - # Test ZIP file processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) - validate_strings(result, XLSX_TEST_STRINGS) - - # Test Wikipedia processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL - ) - text_content = result.text_content.replace("\\", "") - validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES) - - # Test Bing processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL - ) - text_content = result.text_content.replace("\\", "") - validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES) - - # Test RSS processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml")) - text_content = result.text_content.replace("\\", "") - for test_string in RSS_TEST_STRINGS: - assert test_string in text_content - - ## Test non-UTF-8 encoding - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) - validate_strings(result, CSV_CP932_TEST_STRINGS) - - -@pytest.mark.skipif( - skip_exiftool, - reason="do not run if exiftool is not installed", -) -def test_markitdown_exiftool() -> None: - markitdown = MarkItDown() - - # Test JPG metadata processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) - for key in JPG_TEST_EXIFTOOL: - target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" - assert target in result.text_content - - -def test_markitdown_deprecation() -> None: - try: - with catch_warnings(record=True) as w: - test_client = object() - markitdown = MarkItDown(mlm_client=test_client) - assert len(w) == 1 - assert w[0].category is DeprecationWarning - assert markitdown._llm_client == test_client - finally: - resetwarnings() - - try: - with catch_warnings(record=True) as w: - markitdown = MarkItDown(mlm_model="gpt-4o") - assert len(w) == 1 - assert w[0].category is DeprecationWarning - assert markitdown._llm_model == "gpt-4o" - finally: - resetwarnings() - - try: - test_client = object() - markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client) - assert 
False - except ValueError: - pass - - try: - markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o") - assert False - except ValueError: - pass - - -@pytest.mark.skipif( - skip_llm, - reason="do not run llm tests without a key", -) -def test_markitdown_llm() -> None: - client = openai.OpenAI() - markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o") - - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) - - for test_string in LLM_TEST_STRINGS: - assert test_string in result.text_content - - # This is not super precise. It would also accept "red square", "blue circle", - # "the square is not blue", etc. But it's sufficient for this test. - for test_string in ["red", "circle", "blue", "square"]: - assert test_string in result.text_content.lower() - - -if __name__ == "__main__": - """Runs this file's tests from the command line.""" - test_markitdown_remote() - test_markitdown_local() - test_markitdown_exiftool() - test_markitdown_deprecation() - test_markitdown_llm() From fae0faf8ddb7f60bfd92501c9f771f256161697c Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Thu, 3 Apr 2025 11:45:58 +0800 Subject: [PATCH 10/11] chore: infer dtypes for columns as well, remove unneeded na_rep --- .../src/markitdown/converters/_xlsx_converter.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index e6632f6..a020b11 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -77,9 +77,9 @@ class ExcelConverterBase(DocumentConverter): # Because the latter does not replace NaT's with pd.option_context("future.no_silent_downcasting", True): sheet = sheet.fillna(na_rep, axis=1).infer_objects(copy=False) - sheet.columns = sheet.columns.fillna(na_rep) + sheet.columns = sheet.columns.fillna(na_rep).infer_objects(copy=False) - html_content = sheet.to_html(index=False, na_rep=na_rep) + html_content = sheet.to_html(index=False) md_content += ( self._html_converter.convert_string( html_content, **kwargs @@ -127,9 +127,7 @@ class XlsxConverter(ExcelConverterBase): extension=".xlsx", feature="xlsx", ) - ) from _xlsx_dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] + ) from _xlsx_dependency_exc_info[1].with_traceback( # type: ignore[union-attr] _xlsx_dependency_exc_info[2] ) @@ -178,9 +176,7 @@ class XlsConverter(ExcelConverterBase): extension=".xls", feature="xls", ) - ) from _xls_dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] + ) from _xls_dependency_exc_info[1].with_traceback( # type: ignore[union-attr] _xls_dependency_exc_info[2] ) From b1748afa4dbec1ff078534ffd4bd86eeb275054e Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Thu, 3 Apr 2025 11:48:59 +0800 Subject: [PATCH 11/11] chore: pre-commit --- .../src/markitdown/converters/_xlsx_converter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index a020b11..0ddff7c 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -127,7 +127,9 @@ class XlsxConverter(ExcelConverterBase): extension=".xlsx", feature="xlsx", ) - ) from _xlsx_dependency_exc_info[1].with_traceback( # type: 
ignore[union-attr] + ) from _xlsx_dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _xlsx_dependency_exc_info[2] ) @@ -176,7 +178,9 @@ class XlsConverter(ExcelConverterBase): extension=".xls", feature="xls", ) - ) from _xls_dependency_exc_info[1].with_traceback( # type: ignore[union-attr] + ) from _xls_dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _xls_dependency_exc_info[2] )
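
Taken together, the series pins down how blank Excel headers and empty cells surface in the Markdown output: pandas labels blank headers "Unnamed: N" and renders missing cells as "NaN", and after these patches neither placeholder should leak into the per-sheet tables. Below is a minimal sketch of the intended end-to-end behavior, not part of the patches themselves; the `demo.xlsx` fixture and its column names are hypothetical, and the assertions simply mirror `XLSX_TEST_EXCLUDES` from tests/test_markitdown.py.

```python
# Sketch (assumed fixture): one real column, one blank header, one all-empty
# column. Writing it out and converting it back should yield Markdown tables
# free of pandas' "Unnamed:" and "NaN" placeholders.
import pandas as pd
from markitdown import MarkItDown

df = pd.DataFrame({"id": [1, 2], "": ["a", "b"], "empty": [None, None]})
df.to_excel("demo.xlsx", index=False)  # requires openpyxl

result = MarkItDown().convert("demo.xlsx")
print(result.text_content)

# Mirrors XLSX_TEST_EXCLUDES in the test suite:
assert "Unnamed:" not in result.text_content
assert "NaN" not in result.text_content
```

Cleaning the placeholders at the DataFrame level, rather than post-processing the generated HTML, keeps the header row aligned with the data columns in every sheet.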