From ca6dc80e229bc04716d0cd424242c8f89d5547a2 Mon Sep 17 00:00:00 2001 From: Marcos Romero Lamas Date: Mon, 20 Jan 2025 01:04:12 +0100 Subject: [PATCH 1/4] feat: add some deps --- src/markitdown/_markitdown.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 33806e1..c54487e 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -8,11 +8,13 @@ import mimetypes import os import re import shutil +import string import subprocess import sys import tempfile import traceback import zipfile +from io import BytesIO from xml.dom import minidom from typing import Any, Dict, List, Optional, Union from pathlib import Path @@ -32,6 +34,7 @@ import puremagic import requests from bs4 import BeautifulSoup from charset_normalizer import from_path +from lxml import etree as ET # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False From 002c6d1b30aa341891a3a5066b2b939d5595558b Mon Sep 17 00:00:00 2001 From: Marcos Romero Lamas Date: Mon, 20 Jan 2025 01:13:30 +0100 Subject: [PATCH 2/4] feat: preprocess eqns before html conversion --- src/markitdown/_markitdown.py | 88 +++++++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index c54487e..2554c70 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -704,6 +704,42 @@ class DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ + self.nsmap = { + "m": "http://schemas.openxmlformats.org/officeDocument/2006/math", + "o": "urn:schemas-microsoft-com:office:office", + "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "v": "urn:schemas-microsoft-com:vml", + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "cx": "http://schemas.microsoft.com/office/drawing/2014/chartex", + "cx1": "http://schemas.microsoft.com/office/drawing/2015/9/8/chartex", + "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", + "w10": "urn:schemas-microsoft-com:office:word", + "w14": "http://schemas.microsoft.com/office/word/2010/wordml", + "w15": "http://schemas.microsoft.com/office/word/2012/wordml", + "w16se": "http://schemas.microsoft.com/office/word/2015/wordml/symex", + "wpc": "http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas", + "wne": "http://schemas.microsoft.com/office/word/2006/wordml", + "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "wp14": "http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing", + "wpg": "http://schemas.microsoft.com/office/word/2010/wordprocessingGroup", + "wpi": "http://schemas.microsoft.com/office/word/2010/wordprocessingInk", + "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape", + } + def _mml2tex(self, mml_xml: str) -> str: + tree = ET.fromstring(mml_xml) + transform = ET.XSLT(ET.parse(self._mml2tex_xsl_filename)) + return str(transform(tree)) + + def _omml2mml(self, omml_xml: str) -> str: + xml_content = self._template.safe_substitute(omml_xml=omml_xml) + tree = ET.fromstring(xml_content) + transform = ET.XSLT(ET.parse(self._omml2mml_xsl_filename)) + return str(transform(tree)) + + def _omml2tex(self, omml_xml: str) -> str: + mml_xml = self._omml2mml(omml_xml) + return self._mml2tex(mml_xml) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a DOCX extension = kwargs.get("file_extension", "") @@ -711,15 +747,57 @@ class DocxConverter(HtmlConverter): return None result = None - with open(local_path, "rb") as docx_file: - style_map = kwargs.get("style_map", None) + # preprocess docx equations in docx file + docx_file = self._encapsulate_equations(local_path) + style_map = kwargs.get("style_map", None) - result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = result.value - result = self._convert(html_content) + result = mammoth.convert_to_html(docx_file, style_map=style_map) + html_content = self._convert_omath_to_tex(result.value) + result = self._convert(html_content) return result + def _encapsulate_omath(self, xml_content: str) -> str: + def replace(match): + quoted_omath = quote(match.group(0)) + return "$omml$ {} $/omml$".format(quoted_omath) + + xml_content = self._omath_pattern.sub(replace, xml_content) + xml_content = self._omath_para_pattern.sub(lambda m: m.group(1), xml_content) + return xml_content + + def _convert_omath_to_tex(self, html: str) -> str: + def replace(match): + omml_content = unquote(match.group(1)) + return self._omml2tex(omml_content) + + return self._omml_pattern.sub(replace, html) + + def _encapsulate_equations(self, docx_filename: str) -> BytesIO: + """ + + Args: + docx_filename: The path to the docx file to process. + + Returns: + docx file with OMML equations encapsulated in $omml$ and $/omml$ tags. + """ + doc_files = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml") + output_zip = BytesIO() + with zipfile.ZipFile(docx_filename, "r") as z_in: + with zipfile.ZipFile(output_zip, "w", zipfile.ZIP_DEFLATED) as z_out: + z_out.comment = z_in.comment + for item in z_in.infolist(): + if item.filename not in doc_files: + z_out.writestr(item, z_in.read(item.filename)) + else: + xml_content = self._encapsulate_omath( + z_in.read(item.filename).decode("utf8") + ).encode("utf8") + z_out.writestr(item.filename, xml_content) + output_zip.seek(0) + return output_zip + class XlsxConverter(HtmlConverter): """ From fea4a0687e5cc8ac6bc1b5c2e2e53ac473738f66 Mon Sep 17 00:00:00 2001 From: Marcos Romero Lamas Date: Tue, 21 Jan 2025 01:06:39 +0100 Subject: [PATCH 3/4] feat: surround eqs and convert them to latex Office equations are surrounded with $formula$ $/formula$ before they are converted to html using mammoth. After HTML is generated, they are converted to latex using XSL --- src/markitdown/_markitdown.py | 45 ++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 2554c70..9185b4e 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -704,6 +704,13 @@ class DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ + def __init__(self): + self._omath_re = re.compile(r"]*>.+?", flags=re.S) + self._omath_para_re = re.compile( + r"(.+?)", flags=re.S + ) + self._formula_re = re.compile(r"\$formula\$(.+?)\$/formula\$") + self.nsmap = { "m": "http://schemas.openxmlformats.org/officeDocument/2006/math", "o": "urn:schemas-microsoft-com:office:office", @@ -725,15 +732,34 @@ class DocxConverter(HtmlConverter): "wpi": "http://schemas.microsoft.com/office/word/2010/wordprocessingInk", "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape", } + self._xmlns_str = " ".join( + 'xmlns:{}="{}"'.format(key, value) for key, value in self.nsmap.items() + ) + self._template = string.Template( + """ + + $formula_xml + """.format( + self._xmlns_str + ) + ) + self._xsl_folder = os.path.join( + os.path.dirname( + os.path.dirname(os.path.abspath(__file__))), + 'xsl', + ) + self._mml2tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl") + self._omml2mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl") + def _mml2tex(self, mml_xml: str) -> str: tree = ET.fromstring(mml_xml) - transform = ET.XSLT(ET.parse(self._mml2tex_xsl_filename)) + transform = ET.XSLT(ET.parse(self._mml2tex_xsl)) return str(transform(tree)) - def _omml2mml(self, omml_xml: str) -> str: - xml_content = self._template.safe_substitute(omml_xml=omml_xml) + def _omml2mml(self, formula_xml: str) -> str: + xml_content = self._template.safe_substitute(formula_xml=formula_xml) tree = ET.fromstring(xml_content) - transform = ET.XSLT(ET.parse(self._omml2mml_xsl_filename)) + transform = ET.XSLT(ET.parse(self._omml2mml_xsl)) return str(transform(tree)) def _omml2tex(self, omml_xml: str) -> str: @@ -760,10 +786,10 @@ class DocxConverter(HtmlConverter): def _encapsulate_omath(self, xml_content: str) -> str: def replace(match): quoted_omath = quote(match.group(0)) - return "$omml$ {} $/omml$".format(quoted_omath) + return "$formula$ {} $/formula$".format(quoted_omath) - xml_content = self._omath_pattern.sub(replace, xml_content) - xml_content = self._omath_para_pattern.sub(lambda m: m.group(1), xml_content) + xml_content = self._omath_re.sub(replace, xml_content) + xml_content = self._omath_para_re.sub(lambda m: m.group(1), xml_content) return xml_content def _convert_omath_to_tex(self, html: str) -> str: @@ -771,16 +797,17 @@ class DocxConverter(HtmlConverter): omml_content = unquote(match.group(1)) return self._omml2tex(omml_content) - return self._omml_pattern.sub(replace, html) + return self._formula_re.sub(replace, html) def _encapsulate_equations(self, docx_filename: str) -> BytesIO: """ + Surrounds all OMML equations in the docx file with $formula$ and $/formula$ tags. Args: docx_filename: The path to the docx file to process. Returns: - docx file with OMML equations encapsulated in $omml$ and $/omml$ tags. + docx file with OMML equations encapsulated in $formula$ and $/formula$ tags. """ doc_files = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml") output_zip = BytesIO() From 3dd2f0a1186a816b3595e3bff494d5b440637b97 Mon Sep 17 00:00:00 2001 From: Marcos Romero Lamas Date: Sat, 1 Feb 2025 18:54:14 +0100 Subject: [PATCH 4/4] test: add test file --- src/markitdown/_markitdown.py | 41 +++++++++++++++------------------ tests/test_files/equation.docx | Bin 0 -> 3907 bytes tests/test_markitdown.py | 10 ++++++++ 3 files changed, 28 insertions(+), 23 deletions(-) create mode 100644 tests/test_files/equation.docx diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 9185b4e..82b6226 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -748,23 +748,23 @@ class DocxConverter(HtmlConverter): os.path.dirname(os.path.abspath(__file__))), 'xsl', ) - self._mml2tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl") - self._omml2mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl") + self._mml_to_tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl") + self._omml_to_mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl") - def _mml2tex(self, mml_xml: str) -> str: + def _mml_to_tex(self, mml_xml: str) -> str: tree = ET.fromstring(mml_xml) - transform = ET.XSLT(ET.parse(self._mml2tex_xsl)) + transform = ET.XSLT(ET.parse(self._mml_to_tex_xsl)) return str(transform(tree)) - def _omml2mml(self, formula_xml: str) -> str: + def _omml_to_mml(self, formula_xml: str) -> str: xml_content = self._template.safe_substitute(formula_xml=formula_xml) tree = ET.fromstring(xml_content) - transform = ET.XSLT(ET.parse(self._omml2mml_xsl)) + transform = ET.XSLT(ET.parse(self._omml_to_mml_xsl)) return str(transform(tree)) - def _omml2tex(self, omml_xml: str) -> str: - mml_xml = self._omml2mml(omml_xml) - return self._mml2tex(mml_xml) + def _omml_to_tex(self, omml_xml: str) -> str: + mml_xml = self._omml_to_mml(omml_xml) + return self._mml_to_tex(mml_xml) def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a DOCX @@ -774,16 +774,16 @@ class DocxConverter(HtmlConverter): result = None # preprocess docx equations in docx file - docx_file = self._encapsulate_equations(local_path) + docx_file = self._quote_equations(local_path) style_map = kwargs.get("style_map", None) result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = self._convert_omath_to_tex(result.value) + html_content = self._unquote_omath_to_tex(result.value) result = self._convert(html_content) return result - def _encapsulate_omath(self, xml_content: str) -> str: + def _quote_omath(self, xml_content: str) -> str: def replace(match): quoted_omath = quote(match.group(0)) return "$formula$ {} $/formula$".format(quoted_omath) @@ -792,22 +792,17 @@ class DocxConverter(HtmlConverter): xml_content = self._omath_para_re.sub(lambda m: m.group(1), xml_content) return xml_content - def _convert_omath_to_tex(self, html: str) -> str: + def _unquote_omath_to_tex(self, html: str) -> str: def replace(match): omml_content = unquote(match.group(1)) - return self._omml2tex(omml_content) + return self._omml_to_tex(omml_content) return self._formula_re.sub(replace, html) - def _encapsulate_equations(self, docx_filename: str) -> BytesIO: + def _quote_equations(self, docx_filename: str) -> BytesIO: """ - Surrounds all OMML equations in the docx file with $formula$ and $/formula$ tags. - - Args: - docx_filename: The path to the docx file to process. - - Returns: - docx file with OMML equations encapsulated in $formula$ and $/formula$ tags. + Surrounds all OMML equations in the docx file with $formula$ and + $/formula$ tags. """ doc_files = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml") output_zip = BytesIO() @@ -818,7 +813,7 @@ class DocxConverter(HtmlConverter): if item.filename not in doc_files: z_out.writestr(item, z_in.read(item.filename)) else: - xml_content = self._encapsulate_omath( + xml_content = self._quote_omath( z_in.read(item.filename).decode("utf8") ).encode("utf8") z_out.writestr(item.filename, xml_content) diff --git a/tests/test_files/equation.docx b/tests/test_files/equation.docx new file mode 100644 index 0000000000000000000000000000000000000000..e61721a33187c6d4041cdf4f2501c3e798d8dcfb GIT binary patch literal 3907 zcmai1XH-+$)(u1mRRYoxlt`6cLzkwMP(l%e(3B!Ap($OOpfr&pNL2zNO$4M#2SYEB zrZg#+u5>}9NO+0&yPrJYyZ5a##u?|wnfshI_grhw1&0C8&;!WG$N&nm?==CZf)f91 z?P>4oB_d3?mn7=KfGSiEHjf$9Hl|z!BD=7A5LGwGH&A%m+QtADB;izYGOa>U2anF~ zZ1CIJIzNQO;;`Re{R%w}oF$W~d@UW11JE*5*exSTjrS_A{kF|yZdBJK^W`C~`J6aO zx&`(&bh-{&&BgTRV8+>YpTl7wvXud%OXK)_6^Q`=_}?-@hkwJz&C^cA!3}lC_~tEF zdtqOsYe|A43L{Dt@%@?Vz;stX{LHnri+lrxT>~#b*+@B_>gKk6E;Y`&NUR1>WWGJ2|~JpAf(Vdvv`3#1Bx zwdJe_A6oFgmX~Z{8_m@72_`vz0jl|D76-l2SJg*vEFA)#We@o#jj->5{5fipH=Y*Zje}VN1S}tLfo-E*va)lty^FL*-64?pr^Mp>Dc*^K!#D3mm-%E=gJ_*5 zV#pe`?}enIf<=S54Av&u;3Hum8jjmOdl@%-4_|WC=IkU5Qhk*KhMA|LhJg2N>#dR5 zuM@w9BtCT9p0`tsf=);2>;~#*{ox za-^?6dtO|3-Dax(e55stjf_!cZ&HU}aEMZ(t7Q`v^<9zl3<-KHf@7IsV>b^<+(&J( zD#Ez9Ed#6TfCL62kDuIni_b3}&%)m;i1sfAPUqfl+K-wPrE1#br^=G1D#;WUkh`R3 zP`RSVPW&POa5V*qPFc|M99kpypW-Oj7uWu(pU#L7qrwW`a!fo_GK|%EkKMe=P?9EX z(M_)#XG<@pb-lwrA*ZxvRO2?EcuH5SvGyB!WSYwI4zxlyk$-}W2YE(bGpT@H%ADsx ziVoO|OXfpQ!6yy_Zo1^<%r_?4(yI9dmDPHpsQYo(`?6KP8Wc~^B9|u(5Ic|D1Z^vc6580+Jw})N- zvECh0u8x9MxH|KuA;g;n75btXp611=XYYS32a2SvD+Dg{f2R$K357J%MxitF zR}v$Wm$*F1FZx*~5E&cQ~*E=d}7XOo_5a8mDqvRvizJmzt=3aG-##MYxxUb%B^^fp!B zzMt(YT+H+jpQo%x-ssN8xo7rm*CLt&4wjr&%L2}=H>z|r<7nI&l!~#q>oWdYR{lok zw=T$>U7E+d2WI2`*g877<}3Aww-se>*1SD)RVaGu<1rcEc%x-}m@LKY4Z&vlXwl_h zg^Z3kVI53TQj*gr*-v2h+VLTC617=7)&|c!!TTOl0#|K8d3Ic@$dN z)c>S(n!l9pb;r-u-iuJM_6q4xLQY6K#^n;gbNp{asKkHG*J(x#L0}E)<~I& z5X}&nirSl;^vtX#d9%1GGmj$nPa?a%EQuqjfopYgdQPN=%1g$xH4(-v%$Y^Q+bxsOJe8!O>ryUVuCO8 zZC;jA>V%aa-}*uF8%!d%r)=^&zHV53sHW*9)I2?1ktzgS@+cF=o|v26mvg84^Sc`l zH+J+a7iv}EN${+0Q z3@d=$R7`-rh!sm4Nlhx}AjoAjD`?oa5hvy*Z#dwP!xFv#Vwj--UrF1EXxEmLFc;S0 zRdsU}N_xcow5S8Fp+WVyN{Gz4*BFXCuboN@8xSgJ{bncs*jjWy@LL-%!~Mxs3X7!wbb@8J(yw%%d|tl)=ElzU>cUL_iD}Bs!BjW4a-BX2bK|YZk_8@ ztsF?HO*icwm_kjT$RU&E9Qo?7hOS$G2g|xs zrhz_I!%{}%17o+{x6xP4*LIcC1F~f|!W^)berV)1q-Gp6zES{fwrRCUCTD42)K$WF z$aHC}srgHC@ z85A8K0G)(DxHBM{Za7(W#fNs))Ld!tEU=?>T?Ui)95VqD%`DHRv%Khd@f_=TD#jWr zZ!_PL)%n`LVfx(=Tu*25jyS0S(4AE~SV}D*8K<0cjzafqmqm?s!WC6-$=$cRb{#3~ zKI3O8mSz2Y)lbsUWfQUUSz!`t-cCU3?QBWCCfBd7JTy1jjSG-n;y-*4E)nx|GieTp82GXoY0_MkDnj66kAcBr z2%jzDe;$MX&xfs>=T9H?C=sELxyVc~nAG2e%05f#PI5bYFVda6EjdlQUrpP~8M3;! znRmAx(;1gLPRLb8<=quZkgO*C0l0kSb&{xt; z_%aHHICMY023!RUXk1GIi9|vCS3Kv*er-uP%zV z5@Q-gYq}%jv|E#gg5G+A$>Pu?Byzqwl+n#*+?dvPvVPDkn))nu$td0HL?Y%4sLTpUx*Tekj zy1GoC$ha$$aepR?y0i<%c8z7UL2wD9PxjW$pkPjAdhPc1#Kp7&hjaDjsJ_$^&TQ{$ zhy1JE;$=hp$fNYBs?h@ygmg^Nlz&TXC8LXU+lq*+hHMD!4H=pgJvpY>$VzI(@2X1C z9Q$Tw&%8dno?+Wb6~AFLMJMb?GOiu&dx>f!b_T?pz~p2<{=rVOFP(f-L+I*@pg`mY zkU36U8=FN6THg*o6C14S@2#>OM^@s?jMnV2CbArLBJb(uRZD@iAIx%H9O9L*71P)h z66;ssP53dh6ilPIvA+{VJF&4I>LE|e(cJ#zi2YeZ@vMR;LqF4*#kD95a5$pEjHXWBn@(@hbR<{~MeA`%0$|bb>+s6~_34f39@euKo@`-2w@x z{;yCX{Xh6G5AZwqbiW{U$zK5@|IeQJJNmTl|NYMq6sORCb(r7br}vdm9KT`^zm@-` YRNycYd`bWS8U8xP?~*9q0s#R30)%r}JOBUy literal 0 HcmV?d00001 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 689d6f3..300ced3 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -272,6 +272,10 @@ def test_markitdown_local() -> None: assert "# Test" in result.text_content + + + + @pytest.mark.skipif( skip_exiftool, reason="do not run if exiftool is not installed", @@ -358,6 +362,12 @@ def test_markitdown_llm() -> None: for test_string in ["red", "circle", "blue", "square"]: assert test_string in result.text_content.lower() +def test_equation() -> None: + expected_string = r'${\left(x+a\right)}^{n}={\sum }\_{k=0}^{n}\left(\begin{array}{c}n\\ k\end{array}\right){x}^{k}{a}^{n-k}$' + markitdown = MarkItDown() + result = markitdown.convert( os.path.join(TEST_FILES_DIR, "equation.docx")) + assert expected_string == result.text_content.strip() + if __name__ == "__main__": """Runs this file's tests from the command line."""