From 5f2af03aebf01ea8ec4e3e662c1a28b94e4e8e8a Mon Sep 17 00:00:00 2001
From: Sathindu Ganhala Arachchige
Date: Thu, 27 Mar 2025 14:20:21 -0400
Subject: [PATCH] feat: math equation rendering in .docx files

---
 packages/markitdown/pyproject.toml                 |   2 +-
 .../markitdown/converter_utils/__init__.py         |   0
 .../converter_utils/docx/__init__.py               |   0
 .../converter_utils/docx/math/__init__.py          |   0
 .../converter_utils/docx/math/latex_dict.py        | 275 +++++++++++++
 .../converter_utils/docx/math/omml.py              | 362 ++++++++++++++++++
 .../converter_utils/docx/pre_process.py            | 156 ++++++++
 .../markitdown/converters/_docx_converter.py       |   4 +-
 8 files changed, 797 insertions(+), 2 deletions(-)
 create mode 100644 packages/markitdown/src/markitdown/converter_utils/__init__.py
 create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/__init__.py
 create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py
 create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py
 create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py
 create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index 9136108..d5754db 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -47,7 +47,7 @@ all = [
   "azure-identity"
 ]
 pptx = ["python-pptx"]
-docx = ["mammoth"]
+docx = ["mammoth", "lxml"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
 pdf = ["pdfminer.six"]
diff --git a/packages/markitdown/src/markitdown/converter_utils/__init__.py b/packages/markitdown/src/markitdown/converter_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py b/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py
new file mode 100644
index 0000000..5b94bb4
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py
@@ -0,0 +1,275 @@
+# -*- coding: utf-8 -*-
+
+"""
+Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
+On 25/03/2025
+"""
+
+from __future__ import unicode_literals
+
+CHARS = ('{','}', '_', '^', '#', '&', '$', '%', '~')
+
+BLANK = ''
+BACKSLASH = '\\'
+ALN = '&'
+
+CHR = {
+    #Unicode : Latex Math Symbols (str.format templates, {0} is the base expression)
+    #Top accents
+    '\u0300' : '\\grave{{{0}}}',
+    '\u0301' : '\\acute{{{0}}}',
+    '\u0302' : '\\hat{{{0}}}',
+    '\u0303' : '\\tilde{{{0}}}',
+    '\u0304' : '\\bar{{{0}}}',
+    '\u0305' : '\\overbar{{{0}}}',
+    '\u0306' : '\\breve{{{0}}}',
+    '\u0307' : '\\dot{{{0}}}',
+    '\u0308' : '\\ddot{{{0}}}',
+    '\u0309' : '\\ovhook{{{0}}}',
+    '\u030a' : '\\ocirc{{{0}}}',
+    '\u030c' : '\\check{{{0}}}',
+    '\u0310' : '\\candra{{{0}}}',
+    '\u0312' : '\\oturnedcomma{{{0}}}',
+    '\u0315' : '\\ocommatopright{{{0}}}',
+    '\u031a' : '\\droang{{{0}}}',
+    '\u0338' : '\\not{{{0}}}',
+    '\u20d0' : '\\leftharpoonaccent{{{0}}}',
+    '\u20d1' : '\\rightharpoonaccent{{{0}}}',
+    '\u20d2' : '\\vertoverlay{{{0}}}',
+    '\u20d6' : '\\overleftarrow{{{0}}}',
+    '\u20d7' : '\\vec{{{0}}}',
+    '\u20db' : '\\dddot{{{0}}}',
+    '\u20dc' : '\\ddddot{{{0}}}',
+    '\u20e1' : '\\overleftrightarrow{{{0}}}',
+    '\u20e7' : '\\annuity{{{0}}}',
+    '\u20e9' : '\\widebridgeabove{{{0}}}',
+    '\u20f0' : '\\asteraccent{{{0}}}',
+    #Bottom accents
+    '\u0330' : '\\wideutilde{{{0}}}',
+    '\u0331' : '\\underbar{{{0}}}',
+    '\u20e8' : '\\threeunderdot{{{0}}}',
+    '\u20ec' : '\\underrightharpoondown{{{0}}}',
+    '\u20ed' : '\\underleftharpoondown{{{0}}}',
+    '\u20ee' : '\\underleftarrow{{{0}}}',
+    '\u20ef' : '\\underrightarrow{{{0}}}',
+    #Over | group
+    '\u23b4' : '\\overbracket{{{0}}}',
+    '\u23dc' : '\\overparen{{{0}}}',
+    '\u23de' : '\\overbrace{{{0}}}',
+    #Under| group
+    '\u23b5' : '\\underbracket{{{0}}}',
+    '\u23dd' : '\\underparen{{{0}}}',
+    '\u23df' : '\\underbrace{{{0}}}',
+}
+
+CHR_BO = {
+    #Big operators,
+    '\u2140' : '\\Bbbsum',
+    '\u220f' : '\\prod',
+    '\u2210' : '\\coprod',
+    '\u2211' : '\\sum',
+    '\u222b' : '\\int',
+    '\u22c0' : '\\bigwedge',
+    '\u22c1' : '\\bigvee',
+    '\u22c2' : '\\bigcap',
+    '\u22c3' : '\\bigcup',
+    '\u2a00' : '\\bigodot',
+    '\u2a01' : '\\bigoplus',
+    '\u2a02' : '\\bigotimes',
+}
+
+T = {
+
+    '\u2192' : '\\rightarrow ',
+    #Greek letters
+    '\U0001d6fc' : '\\alpha ',
+    '\U0001d6fd' : '\\beta ',
+    '\U0001d6fe' : '\\gamma ',
+    '\U0001d6ff' : '\\delta ',
+    '\U0001d700' : '\\epsilon ',
+    '\U0001d701' : '\\zeta ',
+    '\U0001d702' : '\\eta ',
+    '\U0001d703' : '\\theta ',
+    '\U0001d704' : '\\iota ',
+    '\U0001d705' : '\\kappa ',
+    '\U0001d706' : '\\lambda ',
+    '\U0001d707' : '\\mu ',
+    '\U0001d708' : '\\nu ',
+    '\U0001d709' : '\\xi ',
+    '\U0001d70a' : '\\omicron ',
+    '\U0001d70b' : '\\pi ',
+    '\U0001d70c' : '\\rho ',
+    '\U0001d70d' : '\\varsigma ',
+    '\U0001d70e' : '\\sigma ',
+    '\U0001d70f' : '\\tau ',
+    '\U0001d710' : '\\upsilon ',
+    '\U0001d711' : '\\phi ',
+    '\U0001d712' : '\\chi ',
+    '\U0001d713' : '\\psi ',
+    '\U0001d714' : '\\omega ',
+    '\U0001d715' : '\\partial ',
+    '\U0001d716' : '\\varepsilon ',
+    '\U0001d717' : '\\vartheta ',
+    '\U0001d718' : '\\varkappa ',
+    '\U0001d719' : '\\varphi ',
+    '\U0001d71a' : '\\varrho ',
+    '\U0001d71b' : '\\varpi ',
+    #Relation symbols
+    '\u2190' : '\\leftarrow ',
+    '\u2191' : '\\uparrow ',
+    '\u2192' : '\\rightarrow ',
+    '\u2193' : '\\downarrow ',
+    '\u2194' : '\\leftrightarrow ',
+    '\u2195' : '\\updownarrow ',
+    '\u2196' : '\\nwarrow ',
+    '\u2197' : '\\nearrow ',
+    '\u2198' : '\\searrow ',
+    '\u2199' : '\\swarrow ',
+    '\u22ee' : '\\vdots ',
+    '\u22ef' : '\\cdots ',
+    '\u22f0' : '\\adots ',
+    '\u22f1' : '\\ddots ',
+    '\u2260' : '\\ne ',
+    '\u2264' : '\\leq ',
+    '\u2265' : '\\geq ',
+    '\u2266' : '\\leqq ',
+    '\u2267' : '\\geqq ',
+    '\u2268' : '\\lneqq ',
+    '\u2269' : '\\gneqq ',
+    '\u226a' : '\\ll ',
+    '\u226b' : '\\gg ',
+    '\u2208' : '\\in ',
+    '\u2209' : '\\notin ',
+    '\u220b' : '\\ni ',
+    '\u220c' : '\\nni ',
+
+    #Ordinary symbols
+    '\u221e' : '\\infty ',
+    #Binary relations
+    '\u00b1' : '\\pm ',
+    '\u2213' : '\\mp ',
+    #Italic, Latin, uppercase
+    '\U0001d434' : 'A',
+    '\U0001d435' : 'B',
+    '\U0001d436' : 'C',
+    '\U0001d437' : 'D',
+    '\U0001d438' : 'E',
+    '\U0001d439' : 'F',
+    '\U0001d43a' : 'G',
+    '\U0001d43b' : 'H',
+    '\U0001d43c' : 'I',
+    '\U0001d43d' : 'J',
+    '\U0001d43e' : 'K',
+    '\U0001d43f' : 'L',
+    '\U0001d440' : 'M',
+    '\U0001d441' : 'N',
+    '\U0001d442' : 'O',
+    '\U0001d443' : 'P',
+    '\U0001d444' : 'Q',
+    '\U0001d445' : 'R',
+    '\U0001d446' : 'S',
+    '\U0001d447' : 'T',
+    '\U0001d448' : 'U',
+    '\U0001d449' : 'V',
+    '\U0001d44a' : 'W',
+    '\U0001d44b' : 'X',
+    '\U0001d44c' : 'Y',
+    '\U0001d44d' : 'Z',
+    #Italic, Latin, lowercase
+    '\U0001d44e' : 'a',
+    '\U0001d44f' : 'b',
+    '\U0001d450' : 'c',
+    '\U0001d451' : 'd',
+    '\U0001d452' : 'e',
+    '\U0001d453' : 'f',
+    '\U0001d454' : 'g',
+    '\U0001d456' : 'i',
+    '\U0001d457' : 'j',
+    '\U0001d458' : 'k',
+    '\U0001d459' : 'l',
+    '\U0001d45a' : 'm',
+    '\U0001d45b' : 'n',
+    '\U0001d45c' : 'o',
+    '\U0001d45d' : 'p',
+    '\U0001d45e' : 'q',
+    '\U0001d45f' : 'r',
+    '\U0001d460' : 's',
+    '\U0001d461' : 't',
+    '\U0001d462' : 'u',
+    '\U0001d463' : 'v',
+    '\U0001d464' : 'w',
+    '\U0001d465' : 'x',
+    '\U0001d466' : 'y',
+    '\U0001d467' : 'z',
+}
+
+FUNC ={
+    'sin' : '\\sin({fe})',
+    'cos' : '\\cos({fe})',
+    'tan' : '\\tan({fe})',
+    'arcsin' : '\\arcsin({fe})',
+    'arccos' : '\\arccos({fe})',
+    'arctan' : '\\arctan({fe})',
+    'arccot' : '\\arccot({fe})',
+    'sinh' : '\\sinh({fe})',
+    'cosh' : '\\cosh({fe})',
+    'tanh' : '\\tanh({fe})',
+    'coth' : '\\coth({fe})',
+    'sec' : '\\sec({fe})',
+    'csc' : '\\csc({fe})',
+}
+
+FUNC_PLACE = '{fe}'
+
+BRK = '\\\\'
+
+CHR_DEFAULT = {
+    'ACC_VAL':'\\hat{{{0}}}',
+}
+
+POS = {
+    'top' : '\\overline{{{0}}}', # not sure
+    'bot' : '\\underline{{{0}}}',
+}
+
+POS_DEFAULT = {
+    'BAR_VAL': '\\overline{{{0}}}',
+}
+
+SUB = '_{{{0}}}'
+
+SUP = '^{{{0}}}'
+
+F = {
+    'bar': '\\frac{{{num}}}{{{den}}}',
+    'skw': r'^{{{num}}}/_{{{den}}}',
+    'noBar': '\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}',
+    'lin' : '{{{num}}}/{{{den}}}',
+}
+F_DEFAULT = '\\frac{{{num}}}{{{den}}}'
+
+D = '\\left{left}{text}\\right{right}'
+
+D_DEFAULT = {
+    'left':'(',
+    'right':')',
+    'null':'.',
+}
+
+RAD = '\\sqrt[{deg}]{{{text}}}'
+
+RAD_DEFAULT = '\\sqrt{{{text}}}'
+
+ARR = '\\begin{{array}}{{c}}{text}\\end{{array}}'
+
+LIM_FUNC = {
+    'lim':'\\lim_{{{lim}}}',
+    'max':'\\max_{{{lim}}}',
+    'min':'\\min_{{{lim}}}',
+}
+
+LIM_TO = ('\\rightarrow','\\to')
+
+LIM_UPP = '\\overset{{{lim}}}{{{text}}}'
+
+M = '\\begin{{matrix}}{text}\\end{{matrix}}'
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py
new file mode 100644
index 0000000..3767932
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py
@@ -0,0 +1,362 @@
+# -*- coding: utf-8 -*-
+
+"""
+Office Math Markup Language (OMML)
+Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
+On 25/03/2025
+"""
+
+import xml.etree.ElementTree as ET
+
+from .latex_dict import (CHARS, CHR, CHR_BO, CHR_DEFAULT, POS, POS_DEFAULT
+        , SUB, SUP, F, F_DEFAULT, T, FUNC, D, D_DEFAULT, RAD, RAD_DEFAULT, ARR
+        , LIM_FUNC, LIM_TO, LIM_UPP, M, BRK, BLANK, BACKSLASH, ALN, FUNC_PLACE)
+
+OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
+
+
+def load(stream):
+    tree = ET.parse(stream)
+    for omath in tree.findall(OMML_NS+'oMath'):
+        yield oMath2Latex(omath)
+
+def load_string(string):
+    root = ET.fromstring(string)
+    for omath in root.findall(OMML_NS+'oMath'):
+        yield oMath2Latex(omath)
+
+def escape_latex(strs):
+    last = None
+    new_chr = []
+    strs = strs.replace(r'\\','\\')
+    for c in strs :
+        if (c in CHARS) and (last !=BACKSLASH): # leave already-escaped characters alone
+            new_chr.append(BACKSLASH+c)
+        else:
+            new_chr.append(c)
+        last = c
+    return BLANK.join(new_chr)
+
+def get_val(key,default=None,store=CHR):
+    if key is not None:
+        return key if not store else store.get(key,key)
+    else:
+        return default
+
+
+class Tag2Method(object):
+
+    def call_method(self,elm,stag=None):
+        getmethod = self.tag2meth.get
+        if stag is None:
+            stag = elm.tag.replace(OMML_NS,'')
+        method = getmethod(stag)
+        if method:
+            return method(self,elm)
+        else:
+            return None
+
+    def process_children_list(self,elm,include=None):
+        """
+        process children of the elm, yielding (short tag, converted value, element)
+        """
+        for _e in list(elm):
+            if (OMML_NS not in _e.tag):
+                continue
+            stag = _e.tag.replace(OMML_NS,'')
+            if include and (stag not in include):
+                continue
+            t = self.call_method(_e,stag=stag)
+            if t is None:
+                t = self.process_unknow(_e,stag)
+                if t is None:
+                    continue
+            yield (stag,t,_e)
+
+    def process_children_dict(self,elm,include=None):
+        """
+        process children of the elm,return dict keyed by short tag (last one wins)
+        """
+        latex_chars = dict()
+        for stag,t,e in self.process_children_list(elm,include):
+            latex_chars[stag] = t
+        return latex_chars
+
+    def process_children(self,elm,include=None):
+        """
+        process children of the elm,return string
+        """
+        return BLANK.join(( t if not isinstance(t,Tag2Method) else str(t)
+            for stag,t,e in self.process_children_list(elm,include)))
+
+    def process_unknow(self,elm,stag):
+        return None
+
+
+class Pr(Tag2Method):
+
+    text = ''
+
+    __val_tags = ('chr','pos','begChr','endChr','type')
+
+    __innerdict= None #can't use the __dict__
+
+    """ common properties of element"""
+    def __init__(self, elm):
+        self.__innerdict={}
+        self.text=self.process_children(elm)
+
+    def __str__(self):
+        return self.text
+
+    def __unicode__(self):
+        return self.__str__()
+
+    def __getattr__(self,name):
+        return self.__innerdict.get(name,None)
+
+    def do_brk(self,elm):
+        self.__innerdict['brk'] = BRK
+        return BRK
+
+    def do_common(self,elm):
+        stag = elm.tag.replace(OMML_NS,'')
+        if stag in self.__val_tags:
+            t = elm.get('{0}val'.format(OMML_NS))
+            self.__innerdict[stag] = t
+        return None
+
+    tag2meth = {
+        'brk':do_brk,
+        'chr':do_common,
+        'pos':do_common,
+        'begChr':do_common,
+        'endChr':do_common,
+        'type':do_common,
+    }
+
+
+class oMath2Latex(Tag2Method):
+    """
+    Convert oMath element of omml to latex
+    """
+    _t_dict = T
+
+    __direct_tags = ('box','sSub','sSup','sSubSup','num','den','deg','e')
+
+    def __init__(self, element):
+        self._latex = self.process_children(element)
+
+    def __str__(self):
+        return self.latex
+
+    def __unicode__(self):
+        return self.__str__()
+
+    def process_unknow(self,elm,stag):
+        if stag in self.__direct_tags:
+            return self.process_children(elm)
+        elif stag[-2:] == 'Pr':
+            return Pr(elm)
+        else:
+            return None
+
+    @property
+    def latex(self):
+        return self._latex
+
+    def do_acc(self,elm):
+        """
+        the accent function
+        """
+        c_dict = self.process_children_dict(elm)
+        latex_s = get_val(c_dict['accPr'].chr,default=CHR_DEFAULT.get('ACC_VAL'),store=CHR)
+        return latex_s.format(c_dict['e'])
+
+    def do_bar(self,elm):
+        """
+        the bar function
+        """
+        c_dict = self.process_children_dict(elm)
+        pr = c_dict['barPr']
+        latex_s = get_val(pr.pos,default=POS_DEFAULT.get('BAR_VAL'),store=POS)
+        return pr.text+latex_s.format(c_dict['e'])
+
+    def do_d(self,elm):
+        """
+        the delimiter object
+        """
+        c_dict = self.process_children_dict(elm)
+        pr = c_dict['dPr']
+        null = D_DEFAULT.get('null')
+        s_val = get_val(pr.begChr,default=D_DEFAULT.get('left'),store=T)
+        e_val = get_val(pr.endChr,default=D_DEFAULT.get('right'),store=T)
+        return pr.text+D.format(left= null if not s_val else escape_latex(s_val),
+            text=c_dict['e'],
+            right= null if not e_val else escape_latex(e_val))
+
+
+    def do_spre(self,elm):
+        """
+        the Pre-Sub-Superscript object -- not supported yet
+        """
+        pass
+
+    def do_sub(self,elm):
+        text = self.process_children(elm)
+        return SUB.format(text)
+
+    def do_sup(self,elm):
+        text = self.process_children(elm)
+        return SUP.format(text)
+
+    def do_f(self,elm):
+        """
+        the fraction object
+        """
+        c_dict = self.process_children_dict(elm)
+        pr = c_dict['fPr']
+        latex_s = get_val(pr.type,default=F_DEFAULT,store=F)
+        return pr.text+latex_s.format(num=c_dict.get('num'),den=c_dict.get('den'))
+
+    def do_func(self,elm):
+        """
+        the Function-Apply object (Examples:sin cos)
+        """
+        c_dict = self.process_children_dict(elm)
+        func_name = c_dict.get('fName')
+        return func_name.replace(FUNC_PLACE,c_dict.get('e'))
+
+    def do_fname(self,elm):
+        """
+        the func name
+        """
+        latex_chars = []
+        for stag,t,e in self.process_children_list(elm):
+            if stag == 'r':
+                if FUNC.get(t):
+                    latex_chars.append(FUNC[t])
+                else :
+                    raise NotImplementedError("Not support func %s" % t)
+            else:
+                latex_chars.append(t)
+        t = BLANK.join(latex_chars)
+        return t if FUNC_PLACE in t else t+FUNC_PLACE #do_func will replace this
+
+    def do_groupchr(self,elm):
+        """
+        the Group-Character object
+        """
+        c_dict = self.process_children_dict(elm)
+        pr = c_dict['groupChrPr']
+        latex_s = get_val(pr.chr)
+        return pr.text+latex_s.format(c_dict['e'])
+
+    def do_rad(self,elm):
+        """
+        the radical object
+        """
+        c_dict = self.process_children_dict(elm)
+        text = c_dict.get('e')
+        deg_text = c_dict.get('deg')
+        if deg_text:
+            return RAD.format(deg=deg_text,text=text)
+        else:
+            return RAD_DEFAULT.format(text=text)
+
+    def do_eqarr(self,elm):
+        """
+        the Array object
+        """
+        return ARR.format(text=BRK.join(
+            [t for stag,t,e in self.process_children_list(elm,include=('e',))]))
+
+
+    def do_limlow(self,elm):
+        """
+        the Lower-Limit object
+        """
+        t_dict = self.process_children_dict(elm,include=('e','lim'))
+        latex_s = LIM_FUNC.get(t_dict['e'])
+        if not latex_s :
+            raise NotImplementedError("Not support lim %s" % t_dict['e'])
+        else:
+            return latex_s.format(lim=t_dict.get('lim'))
+
+    def do_limupp(self,elm):
+        """
+        the Upper-Limit object
+        """
+        t_dict = self.process_children_dict(elm,include=('e','lim'))
+        return LIM_UPP.format(lim=t_dict.get('lim'),text=t_dict.get('e'))
+
+    def do_lim(self,elm):
+        """
+        the lower limit of the limLow object and the upper limit of the limUpp function
+        """
+        return self.process_children(elm).replace(LIM_TO[0],LIM_TO[1])
+
+    def do_m(self,elm):
+        """
+        the Matrix object
+        """
+        rows = []
+        for stag,t,e in self.process_children_list(elm):
+            if stag == 'mPr':
+                pass
+            elif stag == 'mr':
+                rows.append(t)
+        return M.format(text=BRK.join(rows))
+
+    def do_mr(self,elm):
+        """
+        a single row of the matrix m
+        """
+        return ALN.join(
+            [t for stag,t,e in self.process_children_list(elm,include=('e',))])
+
+    def do_nary(self,elm):
+        """
+        the n-ary object
+        """
+        res = []
+        bo = ''
+        for stag,t,e in self.process_children_list(elm):
+            if stag == 'naryPr':
+                bo = get_val(t.chr,default=CHR_BO.get('\u222b'),store=CHR_BO) # m:chr omitted => integral, the OMML default
+            else :
+                res.append(t)
+        return bo+BLANK.join(res)
+
+    def do_r(self,elm):
+        """
+        Get text from 'r' element,And try convert them to latex symbols
+        @todo text style support , (sty)
+        @todo \\text (latex pure text support)
+        """
+        _str = []
+        for s in elm.findtext('./{0}t'.format(OMML_NS), ''): # '' when the run has no m:t child
+            #s = s if isinstance(s,unicode) else unicode(s,'utf-8')
+            _str.append(self._t_dict.get(s,s))
+        return escape_latex(BLANK.join(_str))
+
+    tag2meth={
+        'acc' : do_acc,
+        'r' : do_r,
+        'bar' : do_bar,
+        'sub' : do_sub,
+        'sup' : do_sup,
+        'f' : do_f,
+        'func': do_func,
+        'fName' : do_fname,
+        'groupChr' : do_groupchr,
+        'd' : do_d,
+        'rad' : do_rad,
+        'eqArr' : do_eqarr,
+        'limLow' : do_limlow,
+        'limUpp' : do_limupp,
+        'lim' : do_lim,
+        'm' : do_m,
+        'mr' : do_mr,
+        'nary' : do_nary,
+    }
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
new file mode 100644
index 0000000..78552bc
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -0,0 +1,156 @@
+import zipfile
+from io import BytesIO
+from typing import BinaryIO
+from xml.etree import ElementTree as ET
+
+from bs4 import BeautifulSoup, Tag
+
+from .math.omml import OMML_NS, oMath2Latex
+
+MATH_ROOT_TEMPLATE = "".join(
+    (
+        "<w:document ",
+        'xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" ',
+        'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" ',
+        'xmlns:o="urn:schemas-microsoft-com:office:office" ',
+        'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" ',
+        'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" ',
+        'xmlns:v="urn:schemas-microsoft-com:vml" ',
+        'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" ',
+        'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" ',
+        'xmlns:w10="urn:schemas-microsoft-com:office:word" ',
+        'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" ',
+        'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" ',
+        'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" ',
+        'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" ',
+        'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" ',
+        'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">',
+        "{0}</w:document>",
+    )
+)
+
+
+def _convert_omath_to_latex(tag: Tag) -> str:
+    """
+    Converts an OMML (Office Math Markup Language) tag to LaTeX format.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object representing the OMML element.
+
+    Returns:
+        str: The LaTeX representation of the OMML element.
+    """
+    # Format the tag into a complete XML document string
+    math_root = ET.fromstring(MATH_ROOT_TEMPLATE.format(str(tag)))
+    # Find the 'oMath' element within the XML document
+    math_element = math_root.find(OMML_NS + "oMath")
+    # Convert the 'oMath' element to LaTeX using the oMath2Latex function
+    latex = oMath2Latex(math_element).latex
+    return latex
+
+
+def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
+    """
+    Creates a replacement tag for an OMML (Office Math Markup Language) element.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
+        block (bool, optional): If True, the LaTeX will be wrapped in double dollar signs for block mode. Defaults to False.
+
+    Returns:
+        Tag: A BeautifulSoup Tag object representing the replacement element.
+    """
+    t_tag = Tag(name="w:t")
+    t_tag.string = (
+        f"$${_convert_omath_to_latex(tag)}$$"
+        if block
+        else f"${_convert_omath_to_latex(tag)}$"
+    )
+    r_tag = Tag(name="w:r")
+    r_tag.append(t_tag)
+    return r_tag
+
+
+def _replace_equations(tag: Tag):
+    """
+    Replaces OMML (Office Math Markup Language) elements with their LaTeX equivalents.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object representing the OMML element. Could be either "oMathPara" or "oMath".
+
+    Raises:
+        ValueError: If the tag is not supported.
+    """
+    if tag.name == "oMathPara":
+        # Create a new paragraph tag
+        p_tag = Tag(name="w:p")
+        # Replace each 'oMath' child tag with its LaTeX equivalent as block equations
+        for child_tag in tag.find_all("oMath"):
+            p_tag.append(_get_omath_tag_replacement(child_tag, block=True))
+        # Replace the original 'oMathPara' tag with the new paragraph tag
+        tag.replace_with(p_tag)
+    elif tag.name == "oMath":
+        # Replace the 'oMath' tag with its LaTeX equivalent as inline equation
+        tag.replace_with(_get_omath_tag_replacement(tag, block=False))
+    else:
+        raise ValueError(f"Not supported tag: {tag.name}")
+
+
+def _pre_process_math(content: bytes) -> bytes:
+    """
+    Pre-processes the math content in a DOCX -> XML file by converting OMML (Office Math Markup Language) elements to LaTeX.
+    This preprocessed content can be directly replaced in the DOCX file -> XMLs.
+
+    Args:
+        content (bytes): The XML content of the DOCX file as bytes.
+
+    Returns:
+        bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes.
+    """
+    soup = BeautifulSoup(content.decode(), features="xml")
+    for tag in soup.find_all("oMathPara"):
+        _replace_equations(tag)
+    for tag in soup.find_all("oMath"):
+        _replace_equations(tag)
+    return str(soup).encode()
+
+
+def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
+    """
+    Pre-processes a DOCX file with provided steps.
+
+    The process works by unzipping the DOCX file in memory, transforming specific XML files
+    (such as converting OMML elements to LaTeX), and then zipping everything back into a
+    DOCX file without writing to disk.
+
+    Args:
+        input_docx (BinaryIO): A binary input stream representing the DOCX file.
+
+    Returns:
+        BinaryIO: A binary output stream representing the processed DOCX file.
+    """
+    output_docx = BytesIO()
+    # The files that need to be pre-processed from .docx
+    pre_process_enable_files = [
+        "word/document.xml",
+        "word/footnotes.xml",
+        "word/endnotes.xml",
+    ]
+    with zipfile.ZipFile(input_docx, mode="r") as zip_input:
+        files = {name: zip_input.read(name) for name in zip_input.namelist()}
+        with zipfile.ZipFile(output_docx, mode="w") as zip_output:
+            zip_output.comment = zip_input.comment
+            for name, content in files.items():
+                if name in pre_process_enable_files:
+                    try:
+                        # Pre-process the content
+                        updated_content = _pre_process_math(content)
+                        # In the future, if there are more pre-processing steps, they can be added here
+                        zip_output.writestr(name, updated_content)
+                    except Exception:
+                        # If there is an error in processing the content, write the original content
+                        zip_output.writestr(name, content)
+                else:
+                    zip_output.writestr(name, content)
+    output_docx.seek(0)
+    return output_docx
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index a9c469f..6b30094 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -3,6 +3,7 @@ import sys
 from typing import BinaryIO, Any
 
 from ._html_converter import HtmlConverter
+from ..converter_utils.docx.pre_process import pre_process_docx
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -72,6 +73,7 @@ class DocxConverter(HtmlConverter):
             )
 
         style_map = kwargs.get("style_map", None)
+        pre_process_stream = pre_process_docx(file_stream)
         return self._html_converter.convert_string(
-            mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
+            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, **kwargs
         )