From 591c2f7801f13aecaf27f253ae299242df6e421e Mon Sep 17 00:00:00 2001 From: Yuzhong Zhang <141388234+BetterAndBetterII@users.noreply.github.com> Date: Tue, 1 Apr 2025 12:46:43 +0800 Subject: [PATCH] finish parse merged cells --- .../markitdown/converters/_xlsx_converter.py | 61 ++++++++++++++++ packages/markitdown/tests/_test_vectors.py | 19 +++++ .../markitdown/tests/test_files/test.xlsx | Bin 11562 -> 12417 bytes .../markitdown/tests/test_module_vectors.py | 65 +++++++++++++++++- 4 files changed, 143 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 28f73a0..c2bf726 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -81,6 +81,10 @@ class XlsxConverter(DocumentConverter): ) sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") + if kwargs.get("fill_merged_cells", False): + md_content = self._parse_merged_cells(file_stream, sheets, **kwargs) + return DocumentConverterResult(markdown=md_content.strip()) + md_content = "" for s in sheets: md_content += f"## {s}\n" @@ -94,6 +98,63 @@ class XlsxConverter(DocumentConverter): return DocumentConverterResult(markdown=md_content.strip()) + def _parse_merged_cells( + self, file_stream: BinaryIO, sheets: dict[str, pd.DataFrame], **kwargs: Any + ) -> str: + """Use openpyxl to parse merged cells + + Args: + file_stream: BinaryIO + Returns: + str + """ + wb = openpyxl.load_workbook(file_stream) + + merged_cells_info = {} + for sheet in wb.worksheets: + merged_cells = {} + for row in sheet.merged_cells.ranges: + min_col, min_row, max_col, max_row = row.bounds + common_value = sheet.cell(row=min_row, column=min_col).value + for row in range(min_row, max_row + 1): + for col in range(min_col, max_col + 1): + merged_cells[(row, col)] = common_value + # Merged header at first (sort by row) + merged_cells = dict( + sorted(merged_cells.items(), key=lambda x: (x[0][0], x[0][1])) + ) + merged_cells_info[sheet.title] = merged_cells + + wb.close() + + md_content = "" + for s in sheets: + md_content += f"## {s}\n" + df = sheets[s] + for (row, col), value in merged_cells_info[s].items(): + if row == 1: + # Header row merged. + if col > len(df.columns): + # Insert new column + df.insert(col - 1, f"{value} {col-1}", "NaN") + elif str(df.columns[col - 1]).startswith("Unnamed"): + # Rename unnamed column + df.rename( + columns={df.columns[col - 1]: f"{value} {col-1}"}, + inplace=True, + ) + else: + df.at[row - 2, df.columns[col - 1]] = value + html_content = df.to_html(index=False) + md_content += ( + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + + "\n\n" + ) + + return md_content + class XlsConverter(DocumentConverter): """ diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 4a7b54a..73d08c6 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -276,3 +276,22 @@ DATA_URI_TEST_VECTORS = [ ], ), ] + + +MERGED_CELLS_TEST_VECTORS = [ + FileTestVector( + filename="test.xlsx", + mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + charset=None, + url=None, + must_include=[ + "722.0 | NaN | NaN", + "NaN | 42.000000 | NaN", + "Merged Column | Merged Column 6 | Merged Column 2 | Merged Column 2 8", + "## 09060124-b5e7-4717-9d07-3c046eb", + "6ff4173b-42a5-4784-9b19-f49caff4d93d", + "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", + ], + must_not_include=["Unnamed"], + ), +] diff --git a/packages/markitdown/tests/test_files/test.xlsx b/packages/markitdown/tests/test_files/test.xlsx index 3a41e176eb860d6d78d92bcb2f00b2524d925df5..8739d8018dfcf8d7b3d2dc0295802253c922f838 100644 GIT binary patch delta 5891 zcmZ8lWmFuxx*gozDa8kOC{VO$Deh39xD^Hlmm-5xq}bray|~*@ylC;_6bi%OS|~1$ zbMJZY+;{UME3#MCO7iXf?WEi&*rJvf1Hy}1_B##*03gEx00;m8fDee*8|>z22?jgz z_&9+o^_;*PqJ$5D(+`nsYcmTByv!W$h-4d6mdFY~eyj|7Ed2cr#M>a9GX2{%ZyL-= zgvUmEWcR?u=Bc3Igle@kxMJF;SXRK4K-E87HFS<@;_X-J^flWrvm$t3l(M0aqR?*8|&kM^$p?|c!PQdg9pw1kq z1LiJ!ulv>mR^CP*L%gzTIkA8Zxw<*&!$dEIVa1JCmm9LQZmM{qOAqR&;!Oq8d z?4st<2UxvB-D@-qqGvicmzh}DO(?|3-sa42<;mS-buB6tUh+Zhr5qexrF?bJYPOAP zRT2I=6VH;qCXl6+`eENCyGu&#{~EfEQX2MExWmXuQhhxfF?6tS{P0K(bdi*G;^y%3CvF`kCbTa#X5`Q26a%e}9 zmaHIc;38@^0WHCR;3#lke_?)9&F`Y$O)n@ufB^P$m10C6jc|Ys;^O%d^EJt{^kYn* zxGcLrmt+T%buOQSF*~25U7<7LF>jeOBByFwc!2&=X^p|zz4Dz{lOn@!A}?Dxau#Lb#ec zM)tU0yl(TY{xHnXL59?_>L_BNt_QnZ`y$Hckn`E*vVtSM;aQqzhgdAo=04~k#cFWn*2jQXHDensrT*a z&W!Cnd)A+3T%x2!sqN9<7@+P}(IlSzE^O#|dT0}f=Oj!Stirxa_jVU+?kmv+#3k&- zAfi5n%k75bQTq7-v?NMn%;G35E+V}DnR7WK{z#S-aFOd!OPIPD{U)jQ)Q%s^{{bIc z9zgnr?fu=$7OzFFtaZ&;r8OS=1QXP@2s=%V(grD`I~Z9HA=-l0x`b4jn(^x{G3{5w z1{@uWV>m+!WSY$sKukx;L^2o_#3z_V;jnBu=(9II-|D?9#P_1!lo_ zuUw^`n`KNPVzmWtvb5e0BnUc;QMyMoWV$gZOGx}|*~=@J#j18RNzd`I|l8?t(_*iSyHsraO)9k=IpxbhWv`Epsq}qMTP~0!~&3W3C7i>j? z^9hS_=A^46G{`?7NIgVoN(;VbN_6#xFC_;K`9-zDLB51x>M zMs27+L2e(sgQz&(=S3>-@9O@u<&xx1-8r*)k(Rqbs-0dMUUl%;R^XGtQ%Oq{Ww#NGItARs-JD+S0c*EV2Qsf zpJ7#-_o4IvnITys1?R-a5D(33?Ue#E1pm)EGjLz(9_g8qOHlM4R-C9J)&-x`fG^X& zB$A!$dcuGx-H1fzl#4h1YPulS9AQx*b=a;p^^=a4J4P}-1_ptQmIXDJ${ zi{kKbU7vAgcwC;zqfCR%ppKe!@JYwy>4fAl#ycW%w zNl%pt=x;1B_2U~p8*gzDg5u61dtXz(rY#6Vz{0?a)W@&!i3eP%f>Ge2TH8(MN;Z~S zRL`iFQv>4DGTQjYQ)eUs1AY3p%qX(Kbj9o3q)cy0Lo;Kad3IgKK_#~3yF zn?R$Tig6+y=YnF;hXL0I+*pxCU|nX`HAjeLtBdA3XJ2@Ga>bZCyMb$91#OjySoYw^ zVzKol#yXL5qjwm*m68%1B+WwO2i0ejRiZbytXV4116`%Ho{_G;8GFZaeA3{W`&OsTUXZ;1V_okOUsJ9@>#P7e@?r$Nom zi=PBUzjIkS9kB3jQ6=Xa<)4ejlO?4Mx6d>T4wKfi_Rhmoasz%1N#F~{2&5~T(*JUU z5Fehc?-h^wR>r@>Url_U^0EQAhXGWj)a|d^NM8W1$tAxYnHUL$*Ae+;; zRrscu(VahhvBjLATC~%b??@m#W5U8#YnCPYwr@jE0=U8J#uo5|yDLWT0{HX_V$^Qz zY*K>rJ`aZV(b(8(y(T%b`q4bTVaExfQhq7nZ07X%M&PCY-)f3I8S#9ID%f zdK_=pIU-9DuglU>9f7O6Rr5UitL_q6eqkc4^4z{STYt&mgP?*>7aTI(S~E;9k_IIM zQjOPOUD037S|LYxS<~7_Mtm1qpdh2gF@9l7$ejw;NsR%7IhB8?&bT4?86-*~=-E(S z*IJPM1ABmZfW3zjPD@ps9j_N(w*+U_SL30MRxP?nF;p|DE~34 z?)FeVJZy--UuHFHMg?5RM+o?Jt9(nxwbsKQ50YWfqcCy;&pLP?$5DllNgFQve%XW6 zFen?br*zSUFN07i~+@BWN9>CAb`LT%8}bh)*QqfZ4CCv>s0qow$KGGkP|v zFK`}^M+M(8?@J65lq}82F|*MQF@~#d!|9H}Nnd)XPiWG+9>#Iqw>Kp!n!{-1nIDO^D#@r#`LbL3A1d9jf}l zUBi9x|HzC`75q?oQiT>v!&gK-fw)P!3z+%L&lgK5mybFzqd9zK z0;7|8_`41xdCm2lcJGG@i?S=i5;gZ*j{VrfNb(d6Bc`6Q131)j$@Pr!>(RFNb8WB@N@&Ukq0Eshsg{$icQ8oB7)onFlk?LQ3{8%l$ z{y1pW?9u2^?tT!MugE);s2W~}E<$%?5Z{DZ$jURtB7@_5BA5$d?gSd6-{P{08OEtH zA&03e3P=IpNnd(^NSw;jNd@n`Ct~fq9HP*W6n5CjD|%#Q`44oiZJb&%opr%Y%aj>Y z{zwgQ(gk!#gGBtY+!J%+o}PGk|7MQ6hcC#+{V&dR=&ONe_$WJ>PM;Xl;NQsZl!98d z&&UuQ*2v)MNmAN$@_MtC3C$JW%e&qmu+dv11GhU7qjM_38xW3CvFd8hz4Vo)g-x9q zxp>HUeoKz6gKS#J+myJm%<~7j=ZNXVLOA5C)mxK<2E>ilw})p~wQRhya-7&+VA5NY z!9VdMRy->}a%f-Tbl(6gNtJOd&CBPV^&FIckI)jY;7NctuzQ)EiMAMTDF|iGVNSc& z%E|q;Wbp3?)LgOh!jQUDWhfa&d6P5?Rr!4szGjsl>!&{vbUgx99D38BTF&qEe}uQPKb3z3cd(9fid4=PjZxy#l)wsA-|~HTrQP|)7!IQB@!j@EkzMs86(4yD+zPysP*Oov3E-VtDp}#edmZ*na}RkqQntqeEwT3$9pr@;_+xcm=(2q5fOq&0d3_oC*t38fl+W6H%tS&( z`9>vtGC!~U4P<43_N8O04aadm9o4)73Gwk;KglVTOuw{ms;T9+byQ|l$3d}hY+hZc z(E{F8C-YBEyh#?U*H3aW&VG;=333q_xz3n9Ti&4ddPmmCcn!xERY<O747x!sNcY z`0#e1qhwDBG~XKoxs4@0pb*03x7dHRjX>1Yx2w7WGkY-_)1&Y>A7qT;och*y3?0pzM3@X5<8kIn0GAK8*|4*~3Kq4Q9Lb%!Y4j7ztq|1XmL zw$3%Fxm15q5mn7`YqN?@SvgKptP%)T?;+>zUZ_)GDdvSIhDyy|A+B>sDX+VmHVdaT;etLL@14OM$0*UJhmI?e z=ErP*3NXIi_Zm)~77MDM5*CYVfuFnV`ay#76f=Rt(Hv6tv0)h=AN|rO)js%S=iMWz zAtFn#)f@+$6y$WKH17Mn3=I9k54S<$OF0wk_`PW2F2y{*Xa`2$ta@5Vw|CEvdh(~nLcv~EjwpXkXL4wP9Q6z1-w z&!O3i#V4pApi*21St%?vDnbmRcWYphA>zUEr$x}|NAj@xE`_P9$V`HUZ*aO))==Oo}8r~Mic z*w=-ED`KVb4DdMaFuJ3jVy^i+3lF*}av36f3+w#tfO7$=0ru#uZKL1Q_B(}mqC!Ix z69QWN7F-g>I>CR?TG1y_UnJsg*IGKDlR_0~o7(%(~DrgMMO zmuQ`@7Ns<(&~u!1whX%??fR56FjnT8zyA*F)zit6khh8mccqvp$Tc)l3sXrE?*UU+ zuC*2}D*?Wcmf^KDGn~ThDUY5$Uw&F?)2SjgXQib#yn=TydLf6cB`*}J@FI}-ys59# znzU*Jn+E3LKoJG>D6Aybv>{A&UxM|I=kLSNrQ(~kbW}@L77tmm*rLj88luafj^ThC zHq%!wPU#p1w!$K|Kjh}(F7`d%V zL_hU#j!oKg^aH#POUs~cTC4g`?c*OPYo870UY$l2T_N&03emIX=Zcs!_=tnGglBAX zlWq^4?&9QWi-)b>;}vaU=9ok`i1;6Z@bF(;u^Ce4CL!0EYVYYGrJO_QKj&@ELG*~_ z_AY*GwduP!&03+M?FFoE7-V!-yK!5aMK<>*4y=&tqeLrX^Rp^X}m$dnWI)iHD{o&PtMG zrkVk@^i!401W}kyZ`A}ln zqgBk6N!=oiSIz1VHTKE-Zw>W&+++Nha_6R>H6A)wmmH%`I)Y~y*GQ*j$k|hcLH2AX zFg|~SW|j)sOiAkr$(ryBQ`#F1hNSDT>;@O?Lh4f9ZQci{5ql!`Ki|91KkRC#E7$-4 z^bngGYR*AO^UpWZza90{_5WSS@t{*gWYByLGMfLk-Tw;#0H{cxK-AC!4i02<3MeHf zkmkRQ#y?nQTIgF&I-37bMF8nPvrnxg252WOIkcRU1Nnjxx`+ZE z4&+B5^px=#G>wap=Kq8N82}*qf2(k^Li@OwX#Ngafc%r`A^d-U@9a=yZbtN0uD?eA E2l9g-3=AWFA%NC;9QjdXYX7z9KlXAlq&kQhq3lyqn*>28pa z&Y}6Z_gm}wzVG>Kue0~sYn{FJ^E~@(xLKiL859gyJnskKI3Umo0SIOoXuK)s#a z?9H8>?fHBh9lq!~I%f-$T?Y+af!n;P)O-l@Hy(y&7LAk?3Qo@{$3Fz8nd8B1#MBE% zuQBv;CPH~SHTA3$6OY7$tXIpwT~`&TEJqXl@R5M+e{iI!>i6mYJq|y!O37-nRg(Xm zmH;%Ma1KAI91izo0LG=?^!r6#(SS62;0BzdPZQZHL-d}S?1zt;8n|dd@Z|RGekf=n zdP1~yKO|aAH|aw3Qt4$``OpjR@p@jjRO-sxkHQHI$?=Y?&@|uViYinW9 zfz10=P;*9_!}%5KC)KMVy{ZKMZzL=pEC9Q$(T=avW>ri@jS9^_E+C*Il+s4@8cg{M+LF$n0wv)_Ite>?%=vgo0 z8)VKucb$28&&dY4?&kJC4Sm&AM+kICGN3q55MZQ zq~_q(b>J@g3`P-Ki9eb}_VtHszivOki7q`kk*&M=JUQ945qwO2(zb5MAei~KWA)>x z$g8_*6d~qw&lEPtj(Xj6gG2rH9z{FtK)X$d7vB3{NoSC?3^w4e5@zHqBU^wl5BHW% zQI?VP57J+@qw7X7*^*NHblr8%I(PhHj%b#ibc$wKIEz&U(u(n=_zOxq1>;bxdIkG^ z+&=dR!o_jGz%+_NXMa<3Rf5gdOI||TYVO(Ht<&<>u9oQfAd@gr{T5DN*TsMlUpbKj zAJ3-0QE*y{YBWuYvUo;i00;nC^R6y6_3TrD?*)582p&_i8nsY4$WrmgXLdh<{I&2EOxS?ZYK1Z0Wr%AqC7d{V=YHS|BIEe@PYCxGi8Rj~F?77`6 zS_)j+h+g~Gh8|}SZWEPE>!HRp*oEe?kYpqjmA{EZv59am43fQYL7*4+ko*+X0RP|k zOhfcI6~V~DF^a#%^yk1hY@=L61%(QO8$0>6jTYgi?NoMRLb`#N7-i*$R`)335)-YM zpRqP>wZoAP-+gV>_`no{<8IJ6-)|eAjqu`HA__>N;W7T zGi||IlHLzKzR>du%CvPQ)Nrpg^nPIG`8Fc#xxkh`C!1kHtf~e5E6$_F9wNT3{>4wr z?h;)`n>H$jL^h3%vLTjU`DZiNy!BSbbjoYj&=yNP_pOk@z-zu@18BxyfAmtT9$F1!yEyM zcCSwj!#L`%z;2WH6aXIryRpNXYxC+(!*@RUeM@nM2Kv?Y6=D$R=H`}2Z*FpuW@A!e zWMQw^mau3#I&k%Pw9!!V+<3q0;zUlz`KFFXZ3nJ_!;?j$dF7r0K&RR?=HhCxG1rRd?RDeRAhb;dtW9l|+OKZZFkaPVL<@y} zU z8lxn-^r`OwV|p&R7{9Q(bbsh9U7kH8>Hv2+0p`UuYH+|uAZ z$wre<^UYR^1!xo0?r9z>2K++*6Wb}iG!8`WFavV$PH-Qh$D)7Ue^Hv$|qU@#{ElAOkQpwu^wo^a`v&s~g zkXAgTQ(t`7cYm|JRYtiAXESt=QezqS!N5$FY$^BUVW+A`Z2X~Gtr`sprDRC!-JW{r z`^IwGaU35Cf+^jvm>^C<*9LIAu><~qS?(h}enu4Fr9NW{uO3!ZLGUPBKjr_t|4wtt z=ss!__ie{sG4}Q|GGfHpK=Wq%Lcd_UGn&gEqxw_R!fz_U2oikmus zQM+4MbOKi4vS#yfn5Fjlp7 zHM0VC%Px^+lXy?A`HY<8jTWVBWh%{|_(!GxPE)8#&l_y;>CY||;Z09XL18?vsGwBE z1Y+WKvVYYPNujKCOe>7l&d%6OcO^$&M=f_J_kG=?u|!C_Fu`Nd(mdi`nr{*)orZiP zj&=z}$FP+7dSj12-U!H_*30vHIB;L3P67Z<`A(oPR1k%pN{+6Oc(UL_U_}=Dp5<&e zOJXvUP2m?zG0$xT5GN^y^j@mpS7S2BCC9Em1^!#f5AdV_@cS+)ruv|-G;As zE}A-*hSH|Av);!uUp6eT5M8IEs<~efF|@tS9X)D&U+g&pCvRmeQ~y!GF(5jB47iAH zuD7X%bM%+RL1kyX|za~ng*|)OT(EVuLTxcy=sfss80*tWp>@=xPX-gnb4Lq0el<@>_7BbuGL zSDrZg^hQNj&P-H}u{BsKltdyfG=dEDXK(fI~>j|5WbMAe25FrLejcHhE+2n+&| z+}8YdsCM`8b+B^(Te7Qkr=6x@cLEv90J5WM(W;}dK-1nVG~$AYVa&EAT#gbMUI%PhjR%RRL75?#~dOY?__^C zPRe0&y|3{u$9R#!jJZb&D}F?LD)W`Wc=+i`qfeh%D_&rbni+jn?yux#bk1 zrYAG?qtD%3*ZazaJVqHVS;5CZrP|{ap1TZLZ$}e7-@Ws^P8`0VCyM|)m zPlC1ir8t|GEL7OU7};DvFN1+ zWH#xkVv1}K;NC)s9_n*DQ7Hk@&(E}^MhZ z7~c&T%`S*V8)EufDuba@)ai@PLOnuJK+cR^;X{JKf$^;f8BT z8f)VlSWTS>FogL_8j@SY=in2am60@_l=2JPVY z5m%nPy=cSI7fJCW!vCOU|1SjRZ*VS01`l15nv3U`(Q;V}ZVJMhW~C51Zkc|-{Om61 z@5z*ixWCH&^tQ!|v5S&f?hcmk&i#lp-EdB3JD~x-1vn$t55`FK z(VeCp)(=dr{#%>e25C*zkxeeH$j6VBN%`Lm9O-GS>LFt>FANb~zA*BH$NES{kChQ| zDZ;e?in@+b%cGzmv)3}u0R!gOk>Cw|N zTI&~&0!t?5K0}=BLY)nZ`a{kX%9;aedvNR-Cp_(K9>dN2Pn5AkJJ<&^1|ZV$NcqfIB)t<%K`RbdcSmIs?0FG}zgWnb==?alT< z|DjYIkftgQE(NIal0^cSP$1(ohx)BX+E&VPuwVhNp0zSjw9l(wl9KH54j8;t8gG}H zM5?NN89Y^3aI)ta098)cCe=)T=|3e{n_*m=iTmkY7$b?U>t3hMB9wu=HnYqfe{`hH zx5O2hwq|%R^*Z^pgs>o#59&4uGvg6A7!E$B{K^?uWs&vqb2Nseb>bZf=MG|FK*pH5 z=<8iTvA%F+1j=gvF1A$Cb~?!C(R9Oihc_xkMV`OIB_lHqO%V~zX6<7`AUR+ z=5F-JFF862&iX2I^=m+{x6E5&PlA0Ye`DkiR+NFVz?08BeF|+4GQ0LC*EpO0X*WW4 zgmoC|wzTq<%UqmT0Nx2{_)g4F!rEEVa{p@gmXO?KKsBmG2bh!%mHP!rc-(Dj2F``f)&C8uNc z_~Ld1%2j2|9F*pjD8YOpJ8EmK8ZE9*rEDD|j0N1P#S3!6kTkY~t%;+bBKqW$t!paRYHl+scJ9e9`0Kfl@8b@$bj!c*`Cs z2cHo5PX_f9hHw{-8ox72d&y0!Y`i4 zxp`W_HLuj|G>Nb}l=nL&D)K1r*4uDI>upqFW7lTZG8sNM|}} zZ)9t>@u`my+A4nc;l}3(Bia{&f1S#uH3@zCfkK=RP`4ZXwD~NSyteov!a>AhoM)~j zr(@&d>U}e^oF|^>&u$5+#;cC?6Pd!xOZ(>+3j*E01<#*Dh+N|3ru%oKyw%SC_E$rY zTzpTlHz<(Kd~9_8tFeFb6w@Nh_!#N_FcSzw@yB~>WicR2nEyhC@{u8NACTSo