From 3dd2f0a1186a816b3595e3bff494d5b440637b97 Mon Sep 17 00:00:00 2001 From: Marcos Romero Lamas Date: Sat, 1 Feb 2025 18:54:14 +0100 Subject: [PATCH] test: add test file --- src/markitdown/_markitdown.py | 41 +++++++++++++++------------------ tests/test_files/equation.docx | Bin 0 -> 3907 bytes tests/test_markitdown.py | 10 ++++++++ 3 files changed, 28 insertions(+), 23 deletions(-) create mode 100644 tests/test_files/equation.docx diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 9185b4e..82b6226 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -748,23 +748,23 @@ class DocxConverter(HtmlConverter): os.path.dirname(os.path.abspath(__file__))), 'xsl', ) - self._mml2tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl") - self._omml2mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl") + self._mml_to_tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl") + self._omml_to_mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl") - def _mml2tex(self, mml_xml: str) -> str: + def _mml_to_tex(self, mml_xml: str) -> str: tree = ET.fromstring(mml_xml) - transform = ET.XSLT(ET.parse(self._mml2tex_xsl)) + transform = ET.XSLT(ET.parse(self._mml_to_tex_xsl)) return str(transform(tree)) - def _omml2mml(self, formula_xml: str) -> str: + def _omml_to_mml(self, formula_xml: str) -> str: xml_content = self._template.safe_substitute(formula_xml=formula_xml) tree = ET.fromstring(xml_content) - transform = ET.XSLT(ET.parse(self._omml2mml_xsl)) + transform = ET.XSLT(ET.parse(self._omml_to_mml_xsl)) return str(transform(tree)) - def _omml2tex(self, omml_xml: str) -> str: - mml_xml = self._omml2mml(omml_xml) - return self._mml2tex(mml_xml) + def _omml_to_tex(self, omml_xml: str) -> str: + mml_xml = self._omml_to_mml(omml_xml) + return self._mml_to_tex(mml_xml) def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a DOCX @@ -774,16 +774,16 @@ class DocxConverter(HtmlConverter): result = None # preprocess docx equations in docx file - docx_file = self._encapsulate_equations(local_path) + docx_file = self._quote_equations(local_path) style_map = kwargs.get("style_map", None) result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = self._convert_omath_to_tex(result.value) + html_content = self._unquote_omath_to_tex(result.value) result = self._convert(html_content) return result - def _encapsulate_omath(self, xml_content: str) -> str: + def _quote_omath(self, xml_content: str) -> str: def replace(match): quoted_omath = quote(match.group(0)) return "$formula$ {} $/formula$".format(quoted_omath) @@ -792,22 +792,17 @@ class DocxConverter(HtmlConverter): xml_content = self._omath_para_re.sub(lambda m: m.group(1), xml_content) return xml_content - def _convert_omath_to_tex(self, html: str) -> str: + def _unquote_omath_to_tex(self, html: str) -> str: def replace(match): omml_content = unquote(match.group(1)) - return self._omml2tex(omml_content) + return self._omml_to_tex(omml_content) return self._formula_re.sub(replace, html) - def _encapsulate_equations(self, docx_filename: str) -> BytesIO: + def _quote_equations(self, docx_filename: str) -> BytesIO: """ - Surrounds all OMML equations in the docx file with $formula$ and $/formula$ tags. - - Args: - docx_filename: The path to the docx file to process. - - Returns: - docx file with OMML equations encapsulated in $formula$ and $/formula$ tags. + Surrounds all OMML equations in the docx file with $formula$ and + $/formula$ tags. """ doc_files = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml") output_zip = BytesIO() @@ -818,7 +813,7 @@ class DocxConverter(HtmlConverter): if item.filename not in doc_files: z_out.writestr(item, z_in.read(item.filename)) else: - xml_content = self._encapsulate_omath( + xml_content = self._quote_omath( z_in.read(item.filename).decode("utf8") ).encode("utf8") z_out.writestr(item.filename, xml_content) diff --git a/tests/test_files/equation.docx b/tests/test_files/equation.docx new file mode 100644 index 0000000000000000000000000000000000000000..e61721a33187c6d4041cdf4f2501c3e798d8dcfb GIT binary patch literal 3907 zcmai1XH-+$)(u1mRRYoxlt`6cLzkwMP(l%e(3B!Ap($OOpfr&pNL2zNO$4M#2SYEB zrZg#+u5>}9NO+0&yPrJYyZ5a##u?|wnfshI_grhw1&0C8&;!WG$N&nm?==CZf)f91 z?P>4oB_d3?mn7=KfGSiEHjf$9Hl|z!BD=7A5LGwGH&A%m+QtADB;izYGOa>U2anF~ zZ1CIJIzNQO;;`Re{R%w}oF$W~d@UW11JE*5*exSTjrS_A{kF|yZdBJK^W`C~`J6aO zx&`(&bh-{&&BgTRV8+>YpTl7wvXud%OXK)_6^Q`=_}?-@hkwJz&C^cA!3}lC_~tEF zdtqOsYe|A43L{Dt@%@?Vz;stX{LHnri+lrxT>~#b*+@B_>gKk6E;Y`&NUR1>WWGJ2|~JpAf(Vdvv`3#1Bx zwdJe_A6oFgmX~Z{8_m@72_`vz0jl|D76-l2SJg*vEFA)#We@o#jj->5{5fipH=Y*Zje}VN1S}tLfo-E*va)lty^FL*-64?pr^Mp>Dc*^K!#D3mm-%E=gJ_*5 zV#pe`?}enIf<=S54Av&u;3Hum8jjmOdl@%-4_|WC=IkU5Qhk*KhMA|LhJg2N>#dR5 zuM@w9BtCT9p0`tsf=);2>;~#*{ox za-^?6dtO|3-Dax(e55stjf_!cZ&HU}aEMZ(t7Q`v^<9zl3<-KHf@7IsV>b^<+(&J( zD#Ez9Ed#6TfCL62kDuIni_b3}&%)m;i1sfAPUqfl+K-wPrE1#br^=G1D#;WUkh`R3 zP`RSVPW&POa5V*qPFc|M99kpypW-Oj7uWu(pU#L7qrwW`a!fo_GK|%EkKMe=P?9EX z(M_)#XG<@pb-lwrA*ZxvRO2?EcuH5SvGyB!WSYwI4zxlyk$-}W2YE(bGpT@H%ADsx ziVoO|OXfpQ!6yy_Zo1^<%r_?4(yI9dmDPHpsQYo(`?6KP8Wc~^B9|u(5Ic|D1Z^vc6580+Jw})N- zvECh0u8x9MxH|KuA;g;n75btXp611=XYYS32a2SvD+Dg{f2R$K357J%MxitF zR}v$Wm$*F1FZx*~5E&cQ~*E=d}7XOo_5a8mDqvRvizJmzt=3aG-##MYxxUb%B^^fp!B zzMt(YT+H+jpQo%x-ssN8xo7rm*CLt&4wjr&%L2}=H>z|r<7nI&l!~#q>oWdYR{lok zw=T$>U7E+d2WI2`*g877<}3Aww-se>*1SD)RVaGu<1rcEc%x-}m@LKY4Z&vlXwl_h zg^Z3kVI53TQj*gr*-v2h+VLTC617=7)&|c!!TTOl0#|K8d3Ic@$dN z)c>S(n!l9pb;r-u-iuJM_6q4xLQY6K#^n;gbNp{asKkHG*J(x#L0}E)<~I& z5X}&nirSl;^vtX#d9%1GGmj$nPa?a%EQuqjfopYgdQPN=%1g$xH4(-v%$Y^Q+bxsOJe8!O>ryUVuCO8 zZC;jA>V%aa-}*uF8%!d%r)=^&zHV53sHW*9)I2?1ktzgS@+cF=o|v26mvg84^Sc`l zH+J+a7iv}EN${+0Q z3@d=$R7`-rh!sm4Nlhx}AjoAjD`?oa5hvy*Z#dwP!xFv#Vwj--UrF1EXxEmLFc;S0 zRdsU}N_xcow5S8Fp+WVyN{Gz4*BFXCuboN@8xSgJ{bncs*jjWy@LL-%!~Mxs3X7!wbb@8J(yw%%d|tl)=ElzU>cUL_iD}Bs!BjW4a-BX2bK|YZk_8@ ztsF?HO*icwm_kjT$RU&E9Qo?7hOS$G2g|xs zrhz_I!%{}%17o+{x6xP4*LIcC1F~f|!W^)berV)1q-Gp6zES{fwrRCUCTD42)K$WF z$aHC}srgHC@ z85A8K0G)(DxHBM{Za7(W#fNs))Ld!tEU=?>T?Ui)95VqD%`DHRv%Khd@f_=TD#jWr zZ!_PL)%n`LVfx(=Tu*25jyS0S(4AE~SV}D*8K<0cjzafqmqm?s!WC6-$=$cRb{#3~ zKI3O8mSz2Y)lbsUWfQUUSz!`t-cCU3?QBWCCfBd7JTy1jjSG-n;y-*4E)nx|GieTp82GXoY0_MkDnj66kAcBr z2%jzDe;$MX&xfs>=T9H?C=sELxyVc~nAG2e%05f#PI5bYFVda6EjdlQUrpP~8M3;! znRmAx(;1gLPRLb8<=quZkgO*C0l0kSb&{xt; z_%aHHICMY023!RUXk1GIi9|vCS3Kv*er-uP%zV z5@Q-gYq}%jv|E#gg5G+A$>Pu?Byzqwl+n#*+?dvPvVPDkn))nu$td0HL?Y%4sLTpUx*Tekj zy1GoC$ha$$aepR?y0i<%c8z7UL2wD9PxjW$pkPjAdhPc1#Kp7&hjaDjsJ_$^&TQ{$ zhy1JE;$=hp$fNYBs?h@ygmg^Nlz&TXC8LXU+lq*+hHMD!4H=pgJvpY>$VzI(@2X1C z9Q$Tw&%8dno?+Wb6~AFLMJMb?GOiu&dx>f!b_T?pz~p2<{=rVOFP(f-L+I*@pg`mY zkU36U8=FN6THg*o6C14S@2#>OM^@s?jMnV2CbArLBJb(uRZD@iAIx%H9O9L*71P)h z66;ssP53dh6ilPIvA+{VJF&4I>LE|e(cJ#zi2YeZ@vMR;LqF4*#kD95a5$pEjHXWBn@(@hbR<{~MeA`%0$|bb>+s6~_34f39@euKo@`-2w@x z{;yCX{Xh6G5AZwqbiW{U$zK5@|IeQJJNmTl|NYMq6sORCb(r7br}vdm9KT`^zm@-` YRNycYd`bWS8U8xP?~*9q0s#R30)%r}JOBUy literal 0 HcmV?d00001 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 689d6f3..300ced3 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -272,6 +272,10 @@ def test_markitdown_local() -> None: assert "# Test" in result.text_content + + + + @pytest.mark.skipif( skip_exiftool, reason="do not run if exiftool is not installed", @@ -358,6 +362,12 @@ def test_markitdown_llm() -> None: for test_string in ["red", "circle", "blue", "square"]: assert test_string in result.text_content.lower() +def test_equation() -> None: + expected_string = r'${\left(x+a\right)}^{n}={\sum }\_{k=0}^{n}\left(\begin{array}{c}n\\ k\end{array}\right){x}^{k}{a}^{n-k}$' + markitdown = MarkItDown() + result = markitdown.convert( os.path.join(TEST_FILES_DIR, "equation.docx")) + assert expected_string == result.text_content.strip() + if __name__ == "__main__": """Runs this file's tests from the command line."""