From 7b64e6ebfd370fc361ab0ace9a0b0dcfa89d8700 Mon Sep 17 00:00:00 2001 From: Hew Li Yang Date: Sun, 22 Dec 2024 21:22:41 +0800 Subject: [PATCH] chore: consider header for column-wise drop --- src/markitdown/_markitdown.py | 9 +++++++-- tests/test_files/test.xlsx | Bin 11739 -> 12088 bytes 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 67f31af..a576196 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -525,7 +525,7 @@ class XlsxConverter(HtmlConverter): def _clean_colname(self, colname: Any) -> Any: # Remove Pandas header placeholders if isinstance(colname, str) and colname.startswith("Unnamed:"): - return "" + return None return colname def convert( @@ -548,11 +548,16 @@ class XlsxConverter(HtmlConverter): sheet = sheet.rename(columns=lambda col: self._clean_colname(col)) if drop_empty_cols: - sheet = sheet.dropna(axis=1, how="all") + # also consider headers to be part of the column + sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()] if drop_empty_rows: sheet = sheet.dropna(axis=0, how="all") + # convert remaining NaN's to empty string + # because .to_html(na_rep="") does not apply to headers + sheet.columns = sheet.columns.fillna(na_rep) + html_content = sheet.to_html(index=False, na_rep=na_rep) md_content += self._convert(html_content).text_content.strip() + "\n\n" diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx index 0dcbeb9b15bc026e88c46f179e8df38705cc70b1..9153d5292cbf75d168e4a753df4241f38913045c 100755 GIT binary patch delta 3801 zcmZWs2{hE-+n=$9WE(?d-v^DYERlpTvM0tqwjsO7(D1dDEn-Hd5VDi3Wh;|4YbeUT z@4M_F^ZLE-IdA{v-t*k^+;i_e_uS`xp6B^IW0tix?1sAJ6c7*<=mH1?;sc2|HJj#x zL7)(-3U+=n!1SyBoD?Imto4QlG5gDtmIogmbcONI(3lGjQ@G(*nOzPF?KK%q!s0qf(At1nTtdpi)}FO5DJZFx8hv0dA>5 zKi@D(<-pRbgq;)9&K^qB*PnYi3ppso0@0zGg&~&?cyLDUmgxRFyMuIodSU|RtBiGr z{$2X)yW}!)(#Amo=sN}{Nu}05jFFK>OUztx`wZ+)lb&0@TMOy*bwpRxNA(2s#|1R( z9UJt-0mx@@K{q;x@5CgFAA8T}9YW7!Uik&SzGB`nT?X1Jn%h%du^U+{?A~2_Cn|O0 z8B3Y+_cw&kp_}%byg9*kikiC-bVrHMAr-Fi#y>D0sDWX{Iw^^%xKH`&}}RSZ16 z%wCKxVT-LYHcL;-|H9431>nZ<{30Zv27)^?P^#hGJxhW4H zg8MP*$E^TDDsSs{`OWyIKyC2-Q&V90$|kaCIie(6<-Q*%XP)p)RQv4!f~@I;*P?c5 zvCfR}&}cU*-jA!=9P1qa_1=ckwEC;@hDNVQz@`b(SrSZ$h8t zR3vZp%WY!7!nG^nlsF@JbF%Lbd-8;p(%{N({YTm_*7q))MqWjtTEh7tob_pkJS}7; zrj-MeuZ=E!vMrEjBt1T$9SKb*jI{)Lv}~w;c!F3S7?Rp_YOqnbgGxE>=nIGJ@A^E= zezrI%wI89dOAM2RA!s{O@_|v|(c28;pvs_1zwVzOSrNEAQ3g38ZX?2-q+Vrt;>=Yo zw4t{B;-+@VuszAD zu#*I%dQ@^tg!^s+)6MKbR)egc`}$)vXOqx-w5Fp6)`@D^cK$vESH}EL_Qif3{|qnt8h_oI;h1$= zh`vKg_zx(lhry_S?QtR75jO{Q5#fnXigQkX{>UU>=T>3vgf!w20q=yAcFK=sg?s1e z9bsGLmMi1vsEF;3S^&yR1DnM@FQ*0GV3Fg{1k(7_YGo~hn&j17>fJH7-0ags>npnngjXRfPewC4e} zr)rHB1_tm3x~)=-ZCl|}d_5ES@9Lx7>!~$|-~OTA zmK}DcCW*xy5zi+-`7a;cjhydz^35=c1KFey5{G0~Q_Dbi3^^Wc&uBxF&?{ zFV{0s(6o^f8V5f&?h?=O@g70RYrY?aDUe*{*cnE6sB-colIDARXVGFvbgcdidh_QgrY$c&DgC+vN|k zu}U9{Wj%)2dD=zotQzayuYNq79_-Z~(Qp_Et;3QOx2u8_9%vrey_WM(!;9uqSfF4d zo^;25{ADeTXO+FgOYhQAs)Xe*tUc$sPwfy;3f`umc_;2N7fg^{)F4m&*x}bdtL;&; z2yg=%DRsSMWluFND&!KjUM^PVp>KP7pDy+hgV9TO9AZ9ppmUcUai{38vtx_hyqg!WkEjMP)Ef1AlR z)@L1VT?UI0xl+jwT@4RG?C8~Jxi@fJ+*5CEYH(%g>t-|LRnF4B-Rb497dJh#j-yOf zIQk&x%m%$Vm&`6SUe2rSJ+r{gtyUBsl996j&}R>8s9hH$dOTlEYnri@6swES0&+sS zy5vP;<%_~o54&uY2;XFM=OmgmzBG9)Zd1-z|Kigh&!4XPq@)tY;^O^G7NLHzSrpQW zbBSSNmA|(;XUNkVgoHY=Snl<5s@##8mja#r@6Vs}3=%2%?Qgy>Ib?tfg!p}_4g6crh@5hH{ zpUh4s_J>wBR?mW9haXkDGI|ZBNC$+q-#d>UPcGT+6FquOmVIJhXL_HmE;oG8d*qe2 z{w&(-CO`S|tXAxH(#-_XmT0~N1+0eSrtD3krUEVFx zBdk?8KkAqLRW+TwEef;TMw{~NU@q=CBku|$vLOc21ZSQ*q*KR5XJjv&^(_s?=Mutj#}D*(|ru`(oh`qR*#JEzlg zbsH)*oSp^7*j7q|F>Bwo2=QXOEk#o-s0N}MZWgr=RbDCfD7)m!RhJW`ZEZ^C-c@!+ z;_;$X9)!i%oA!9K_gClfyj&|egu!()$a^!$&VdlaDV#p5g|KSb0aJgX+0j_`XjY=`?y9WDuRV1cJ zCIG=2X#V2dWs1&)@mw(Oa0isTgSrdK)$=OG!(BN+jh0NBkzy+xU|-yHW@SnejJrU* z#lu*cLnDY6V$lG+K4Dx%If^X}w$i6HP3=(;V<-*Y1E+*jIm~Bkc`XXp1T=vb-^4(p z|ES^nq!2)JRmP7zuBm_2*BF77%3606rStLP&T}48`dNU>!=(u^en=nmyuttr09$Ol z;ZWXUvL}Tmh7^_DCo!E+R(T2B0#jV)smQ3R61)Zm+|0r)4%9|8sUs8*eAsX`_nor^ zS4v1BMwaN2VJF|-llgQ)>)Qy426%sd;k|HeMt%lJnZ}zPiUvnFAV%5Y6>~`mM6i%D z;{M8lW{+E!IK4rAvDz5fvnA~=LmLqnw1=Mbbu2aRRik1uOU;Xvp8A|#pRY$E-C!+L zZ6h-v!O|l$4*nA)m@Qzu&4k?YA0ek`Hw>q6q8A1?soKz4q^eQ}In5?N@o*bjVGrnn zVi~m#n;t!l$_9`+`kB8%~reWXVC(ed&AdD>X3P4O)9p#`<15Z|EW`?E9 zNMm2Uob8;)f82@PidG#CPl1?kFAVrqV`XG=1ak#b84W_;_XVHHfA<1aiHZCsqy1Z% z;U!>3;3<3vjQjtHe-;c1mciG+_&NWdY>7w{e8x4=bs8VMz`3?94<4-{`H0VC_+#YgiB;Y-fdE=x_1( zpIH5S!8d{bMlK#K2_s7u#3u=G;)N_#5g{QvI`+;k^Gkf3>vuKePV delta 3414 zcmY*cXHXM}5)OnGgCPV65SmnhBnpTUr9(LBy*CvIMIyaQiNZku>7jR|83d`)q^WeJ zHvy%dfC2_Av{0URZ|2>+{jsw# z0Dv?4IxZnvg6V=bP#PM>c|sJenb7;Cif#M#8qBn)a{B0ta}!UFo|^O2J|#cF4152j z-*+g)m>|&%XyW zNPP06^(5Rxrp}(D!jt;2Qd>2CWTsolR5|2FloK?kZ^l*abPD9BB`wHxpS9H&r)$nN zsVMVA+W4WUi`wnnlq+`W#@SIu?!r5qlo7QT>i$4DFw>=(9v4=$mh&j0s^m2w7x{zdmITNh*eMH0%IP zfOL8T!A>MBqX?ns{L-eKAo)2OD}mQ}!=M@0?M>P-QPJ_kTEgs76ZzOr4*`-#_ao6l&3qjk0|%D0m& z1kPG*K{pF8NH+tD%wp}_I$~IjHF^=1S9&5|*2=Ke7ZVWC-?_=`^obe$Av?c zZ~4t=vG|0!SHJFhN~F64VXNnlLXJZsLJg!n2Y$X{sg)d`=W7>d z_Cyv_-nKu8RWBAco0t^dw?)#q0oY~SiP80@4(?J-ci~aIpM<=*--`3@? z3J*BeD(51Q+!%G|VMSj^Q{t=|YnRA(4bdXo$-A zsMfc`(xm(qsh|0mhC40^eGvzL2U@M2ZRh(}5!-6Gr**^u>)U~A9~zGW0+eqq=;9|& z*O~g)rh3AtziOB%ucEPw{yGUvOwCFAcB8g5bF4%}_J{O)XX?$Lt9-)#ixT4XymW2z zxg#$x*dG{J99)Wdd9CdL!V#}oTYcr~jnNP8`UXy~RD{SFeAo4&9Jjc9BWU0{q9>er@W}JQlB``qQfD~N*d8}{FcwTI zN<`O3xSiO;t!U|k3yN(?D{uROR#+u(UJH>5omM)NlWXrCWSZUIBu}#rgZoXDZk1iL zxhm@0AjzB*_Vu_9DLbsZ?_+-c6$}8Jp92B^kO|xW*hG7YHAceF3^pjz_{0#=`nWjZ8BOwJ#Q|GQL>GC^2h!q+X;_O7eQSlXVQ=mgC1C z+8dXevoKZ<$YZWnlU*}P(;l%lr8wJ+H&b%wQegruZY}i;k`L(&1ZE1AtM_MiLK&e< zu=pBRPPIES>l%zX)SmsjW0Teu!QBRgW?W}rcdmDeIm%#CzPwfTCszCunt7qgxVwQP zl7USh${5EOzZ6)1;F1GxUX~4fRa^*q9)g`R9`xcZPIEQt#3%T-n`a#e=K88aAQaxh zRn)bSI}&aZdB2)2f>P%s*0+9A<`mZ)gzPp5Y+||B7%4LF-H0J&xN%dK67Ex;ECaYi8!#>Ko1z(FE~;-ud{`@@PPBFq!{0;a(v?GGJO-WQY?6i!~EW^Yivf)wpWO( za{Ss^sRg^1bY1UqzYpqCY zjI#m840S!TaZMX( z%*Fxs#O&HWa*PgPt{3vbSe%FTNmlFR6wy1UcZWK#XQmz6*JZXsQ`Z_slUoIFw<;UW zr9E4WU(7%p4b?Y?$6^O=sXJBkjNr&0(#lrj^tCElN(t`|;!fMAusBk2{W#x-spP&X zlacqb@QV*x)*lratah>G>^DDkjrmRR?nn)kq{q2!4{hZvwEB$hFs;!k8zjdK*GrDm ze(vD>-2RUB<6dy#>QT3WLX~C8UqR#0lYzUvM1EWGNbl9OJ&1x~D2qL1gHW11>@5^+jrNTdt*w!zED=q!9gmDni~Q4`E>URJ}zmI7%YwrAgtM_S402L)NC{ zG70^J4d9Gm=MD6Qy>NbUbOdi84D-azXoxfF#v(ylXq9KuV8z9hPjMAzhELQ2<@LxC z4q5g1S-V`59&*q(XZIdX=ajVZK1W3K&w?u8?W?x{au&wBQNmyRJy$=^tBdqk=2@po zSbHJiGkpge5{n9d$cA69v#Jrpa0FUcgr=}mTkZCKv}Q0-STcN_>rlqi!droe5HK6t zU6TQ|ujb@qma4mZW2X1)u2HbZyu*7^#D3TkO?QH*ZXN|Ni7`D^Jbc+F@ z>R)-8J?|n>9aqQ7pP5EzLWNk^Yp})N={lT{gvSbw*|4fAc9Bgc2K40C^Q_r43q*mR zS!5GE*Jw&wal8K5$b60i{tzN^?Xj=;We+4e+y8d{TF#z91~BoBSE4AvQY;JCLVx^A z1$()48Cbu#RS#rJET#huA#`aY2<`AjSpvjwXg~rSQe{_R@=@Am*PizyqX~0gxm$$u zM6cHoD14!+@Kt4a@w}b4iW>^^8Ie`(_Yj%&(|p<6vXnn4Qj{EuFN0081Q3jHfnBsLV9=Ue(YHt*R}i%R!lYUe9#YvADDU6nw)p)FxzYjv htp8WHScEiwMUnREWfJr%0s;X*01dx;gGv6K^dE_0Nv8k+