feat: math equation rendering in .docx files

2025-03-27 14:20:21 -04:00 · 2025-03-27 14:20:21 -04:00 · 5f2af03aeb
commit 5f2af03aeb
parent 2ffe6ea591
8 changed files with 797 additions and 2 deletions
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@ -47,7 +47,7 @@ all = [
  "azure-identity"
 ]
 pptx = ["python-pptx"]
-docx = ["mammoth"]
+docx = ["mammoth", "lxml"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
 pdf = ["pdfminer.six"]
--- a/packages/markitdown/src/markitdown/converter_utils/__init__.py
+++ b/packages/markitdown/src/markitdown/converter_utils/__init__.py
--- a/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py
--- a/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py
--- a/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py
@ -0,0 +1,275 @@
+# -*- coding: utf-8 -*-
+
+"""
+Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
+On 25/03/2025
+"""
+
+from __future__ import unicode_literals
+
+# Characters that get a backslash prefix when text is escaped for LaTeX.
+CHARS = ('{','}', '_', '^', '#', '&', '$', '%', '~')
+
+# Separator used when joining converted fragments into one string.
+BLANK = ''
+# A single literal backslash.
+BACKSLASH = '\\'
+# Cell separator placed between the entries of one matrix row.
+ALN = '&'
+
+# Maps an accent / group character (the value of an OMML ``m:chr``) to a
+# ``str.format`` template whose positional field ``{0}`` receives the base
+# expression. Every literal brace is doubled so that ``.format`` emits a
+# single ``{`` / ``}`` around the argument.
+CHR = {
+	#Unicode : Latex Math Symbols
+	#Top accents
+	'\u0300' : '\\grave{{{0}}}',
+	'\u0301' : '\\acute{{{0}}}',
+	'\u0302' : '\\hat{{{0}}}',
+	'\u0303' : '\\tilde{{{0}}}',
+	'\u0304' : '\\bar{{{0}}}',
+	'\u0305' : '\\overbar{{{0}}}',
+	'\u0306' : '\\breve{{{0}}}',
+	'\u0307' : '\\dot{{{0}}}',
+	'\u0308' : '\\ddot{{{0}}}',
+	'\u0309' : '\\ovhook{{{0}}}',
+	# The two templates below used to end in an unbalanced '}' which made
+	# str.format raise "Single '}' encountered in format string".
+	'\u030a' : '\\ocirc{{{0}}}',
+	'\u030c' : '\\check{{{0}}}',
+	'\u0310' : '\\candra{{{0}}}',
+	'\u0312' : '\\oturnedcomma{{{0}}}',
+	'\u0315' : '\\ocommatopright{{{0}}}',
+	'\u031a' : '\\droang{{{0}}}',
+	'\u0338' : '\\not{{{0}}}',
+	'\u20d0' : '\\leftharpoonaccent{{{0}}}',
+	'\u20d1' : '\\rightharpoonaccent{{{0}}}',
+	'\u20d2' : '\\vertoverlay{{{0}}}',
+	'\u20d6' : '\\overleftarrow{{{0}}}',
+	'\u20d7' : '\\vec{{{0}}}',
+	'\u20db' : '\\dddot{{{0}}}',
+	'\u20dc' : '\\ddddot{{{0}}}',
+	'\u20e1' : '\\overleftrightarrow{{{0}}}',
+	'\u20e7' : '\\annuity{{{0}}}',
+	'\u20e9' : '\\widebridgeabove{{{0}}}',
+	'\u20f0' : '\\asteraccent{{{0}}}',
+	#Bottom accents
+	'\u0330' : '\\wideutilde{{{0}}}',
+	'\u0331' : '\\underbar{{{0}}}',
+	'\u20e8' : '\\threeunderdot{{{0}}}',
+	'\u20ec' : '\\underrightharpoondown{{{0}}}',
+	'\u20ed' : '\\underleftharpoondown{{{0}}}',
+	'\u20ee' : '\\underleftarrow{{{0}}}',
+	'\u20ef' : '\\underrightarrow{{{0}}}',
+	#Over | group
+	'\u23b4' : '\\overbracket{{{0}}}',
+	'\u23dc' : '\\overparen{{{0}}}',
+	'\u23de' : '\\overbrace{{{0}}}',
+	#Under| group
+	'\u23b5' : '\\underbracket{{{0}}}',
+	'\u23dd' : '\\underparen{{{0}}}',
+	'\u23df' : '\\underbrace{{{0}}}',
+}
+
+# Maps the operator character of an n-ary object (read from ``naryPr``)
+# to the LaTeX command for that big operator.
+CHR_BO = {
+	#Big operators,
+	'\u2140' : '\\Bbbsum',
+	'\u220f' : '\\prod',
+	'\u2210' : '\\coprod',
+	'\u2211' : '\\sum',
+	'\u222b' : '\\int',
+	'\u22c0' : '\\bigwedge',
+	'\u22c1' : '\\bigvee',
+	'\u22c2' : '\\bigcap',
+	'\u22c3' : '\\bigcup',
+	'\u2a00' : '\\bigodot',
+	'\u2a01' : '\\bigoplus',
+	'\u2a02' : '\\bigotimes',
+}
+
+# Per-character translation table applied to the text of a math run:
+# a Unicode math character maps to its LaTeX command (with a trailing
+# space so the command does not swallow the next letter) or to the plain
+# ASCII letter for the mathematical-italic alphabet.
+T = {
+	#Greek letters (Mathematical Italic Small, U+1D6FC..U+1D71B)
+	'\U0001d6fc' : '\\alpha ',
+	'\U0001d6fd' : '\\beta ',
+	'\U0001d6fe' : '\\gamma ',
+	'\U0001d6ff' : '\\delta ',
+	'\U0001d700' : '\\epsilon ',
+	'\U0001d701' : '\\zeta ',
+	'\U0001d702' : '\\eta ',
+	'\U0001d703' : '\\theta ',
+	'\U0001d704' : '\\iota ',
+	'\U0001d705' : '\\kappa ',
+	'\U0001d706' : '\\lambda ',
+	'\U0001d707' : '\\mu ',
+	'\U0001d708' : '\\nu ',
+	'\U0001d709' : '\\xi ',
+	'\U0001d70a' : '\\omicron ',
+	'\U0001d70b' : '\\pi ',
+	'\U0001d70c' : '\\rho ',
+	'\U0001d70d' : '\\varsigma ',
+	'\U0001d70e' : '\\sigma ',
+	'\U0001d70f' : '\\tau ',
+	'\U0001d710' : '\\upsilon ',
+	'\U0001d711' : '\\phi ',
+	'\U0001d712' : '\\chi ',
+	'\U0001d713' : '\\psi ',
+	'\U0001d714' : '\\omega ',
+	'\U0001d715' : '\\partial ',
+	'\U0001d716' : '\\varepsilon ',
+	'\U0001d717' : '\\vartheta ',
+	'\U0001d718' : '\\varkappa ',
+	'\U0001d719' : '\\varphi ',
+	'\U0001d71a' : '\\varrho ',
+	'\U0001d71b' : '\\varpi ',
+	#Relation symbols
+	'\u2190' : '\\leftarrow ',
+	'\u2191' : '\\uparrow ',
+	'\u2192' : '\\rightarrow ',
+	'\u2193' : '\\downarrow ',
+	'\u2194' : '\\leftrightarrow ',
+	'\u2195' : '\\updownarrow ',
+	'\u2196' : '\\nwarrow ',
+	'\u2197' : '\\nearrow ',
+	'\u2198' : '\\searrow ',
+	'\u2199' : '\\swarrow ',
+	'\u22ee' : '\\vdots ',
+	'\u22ef' : '\\cdots ',
+	'\u22f0' : '\\adots ',
+	'\u22f1' : '\\ddots ',
+	'\u2260' : '\\ne ',
+	'\u2264' : '\\leq ',
+	'\u2265' : '\\geq ',
+	'\u2266' : '\\leqq ',
+	'\u2267' : '\\geqq ',
+	'\u2268' : '\\lneqq ',
+	'\u2269' : '\\gneqq ',
+	'\u226a' : '\\ll ',
+	'\u226b' : '\\gg ',
+	'\u2208' : '\\in ',
+	'\u2209' : '\\notin ',
+	'\u220b' : '\\ni ',
+	'\u220c' : '\\nni ',
+
+	#Ordinary symbols
+	'\u221e' : '\\infty ',
+	#Binary relations
+	'\u00b1' : '\\pm ',
+	'\u2213' : '\\mp ',
+	#Italic, Latin, uppercase
+	'\U0001d434' : 'A',
+	'\U0001d435' : 'B',
+	'\U0001d436' : 'C',
+	'\U0001d437' : 'D',
+	'\U0001d438' : 'E',
+	'\U0001d439' : 'F',
+	'\U0001d43a' : 'G',
+	'\U0001d43b' : 'H',
+	'\U0001d43c' : 'I',
+	'\U0001d43d' : 'J',
+	'\U0001d43e' : 'K',
+	'\U0001d43f' : 'L',
+	'\U0001d440' : 'M',
+	'\U0001d441' : 'N',
+	'\U0001d442' : 'O',
+	'\U0001d443' : 'P',
+	'\U0001d444' : 'Q',
+	'\U0001d445' : 'R',
+	'\U0001d446' : 'S',
+	'\U0001d447' : 'T',
+	'\U0001d448' : 'U',
+	'\U0001d449' : 'V',
+	'\U0001d44a' : 'W',
+	'\U0001d44b' : 'X',
+	'\U0001d44c' : 'Y',
+	'\U0001d44d' : 'Z',
+	#Italic, Latin, lowercase
+	'\U0001d44e' : 'a',
+	'\U0001d44f' : 'b',
+	'\U0001d450' : 'c',
+	'\U0001d451' : 'd',
+	'\U0001d452' : 'e',
+	'\U0001d453' : 'f',
+	'\U0001d454' : 'g',
+	# U+1D455 is unassigned; the italic small h lives at U+210E.
+	'\u210e' : 'h',
+	'\U0001d456' : 'i',
+	'\U0001d457' : 'j',
+	'\U0001d458' : 'k',
+	'\U0001d459' : 'l',
+	'\U0001d45a' : 'm',
+	'\U0001d45b' : 'n',
+	'\U0001d45c' : 'o',
+	'\U0001d45d' : 'p',
+	'\U0001d45e' : 'q',
+	'\U0001d45f' : 'r',
+	'\U0001d460' : 's',
+	'\U0001d461' : 't',
+	'\U0001d462' : 'u',
+	'\U0001d463' : 'v',
+	'\U0001d464' : 'w',
+	'\U0001d465' : 'x',
+	'\U0001d466' : 'y',
+	'\U0001d467' : 'z',
+}
+
+# Maps a recognised function name to its LaTeX form; the ``{fe}``
+# placeholder is later replaced by the function's argument.
+FUNC ={
+	'sin' : '\\sin({fe})',
+	'cos' : '\\cos({fe})',
+	'tan' : '\\tan({fe})',
+	'cot' : '\\cot({fe})',
+	'arcsin' : '\\arcsin({fe})',
+	'arccos' : '\\arccos({fe})',
+	'arctan' : '\\arctan({fe})',
+	'arccot' : '\\arccot({fe})',
+	'sinh' : '\\sinh({fe})',
+	'cosh' : '\\cosh({fe})',
+	'tanh' : '\\tanh({fe})',
+	'coth' : '\\coth({fe})',
+	'sec'  : '\\sec({fe})',
+	'csc'  : '\\csc({fe})',
+}
+
+# Placeholder inside a function-name template that is substituted with the
+# function's argument.
+FUNC_PLACE = '{fe}'
+
+# LaTeX line break (two backslashes); joins the rows of arrays and matrices.
+BRK = '\\\\'
+
+# Accent template used when an accent object does not specify a character.
+CHR_DEFAULT = {
+	'ACC_VAL':'\\hat{{{0}}}',
+}
+
+# Bar templates keyed by the bar position value.
+POS = {
+	'top' : '\\overline{{{0}}}', # not sure
+	'bot' : '\\underline{{{0}}}',
+}
+
+# Bar template used when a bar object does not specify a position.
+POS_DEFAULT = {
+	'BAR_VAL': '\\overline{{{0}}}',
+}
+
+# Subscript template; {0} receives the subscript expression.
+SUB = '_{{{0}}}'
+
+# Superscript template; {0} receives the superscript expression.
+SUP = '^{{{0}}}'
+
+# Fraction templates keyed by the fraction type value; each takes the
+# named fields ``num`` and ``den``.
+F = {
+	'bar': '\\frac{{{num}}}{{{den}}}',
+	'skw': r'^{{{num}}}/_{{{den}}}',
+	'noBar': '\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}',
+	'lin' : '{{{num}}}/{{{den}}}',
+}
+# Fraction template used when no fraction type is given.
+F_DEFAULT = '\\frac{{{num}}}{{{den}}}'
+
+# Delimiter template: ``left`` / ``right`` are the delimiter characters and
+# ``text`` is the enclosed expression.
+D  = '\\left{left}{text}\\right{right}'
+
+# Default delimiter characters; 'null' is emitted for an empty delimiter.
+D_DEFAULT = {
+	'left':'(',
+	'right':')',
+	'null':'.',
+}
+
+# Radical template used when a degree is present.
+RAD = '\\sqrt[{deg}]{{{text}}}'
+
+# Radical template used when no degree is present.
+RAD_DEFAULT = '\\sqrt{{{text}}}'
+
+# Equation-array template; ``text`` is the rows already joined by line breaks.
+# ``\\end`` is written with an escaped backslash: a bare ``\e`` is an invalid
+# escape sequence that newer Python versions warn about.
+ARR = '\\begin{{array}}{{c}}{text}\\end{{array}}'
+
+# Lower-limit templates keyed by the function name; ``lim`` is the limit text.
+LIM_FUNC = {
+	'lim':'\\lim_{{{lim}}}',
+	'max':'\\max_{{{lim}}}',
+	'min':'\\min_{{{lim}}}',
+}
+
+# (text to find, replacement) applied to limit expressions.
+LIM_TO  = ('\\rightarrow','\\to')
+
+# Upper-limit template; ``lim`` is placed over ``text``.
+LIM_UPP = '\\overset{{{lim}}}{{{text}}}'
+
+# Matrix template; ``text`` is the rows already joined by line breaks.
+M = '\\begin{{matrix}}{text}\\end{{matrix}}'
--- a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py
@ -0,0 +1,362 @@
+# -*- coding: utf-8 -*-
+
+"""
+Office Math Markup Language (OMML)
+Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
+On 25/03/2025
+"""
+
+import xml.etree.ElementTree as ET
+
+# latex_dict.py lives next to this module, so it is imported relatively;
+# there is no top-level ``experiment`` package to import it from.
+from .latex_dict import (CHARS, CHR, CHR_BO, CHR_DEFAULT, POS, POS_DEFAULT
+	, SUB, SUP, F, F_DEFAULT, T, FUNC, D, D_DEFAULT, RAD, RAD_DEFAULT, ARR
+	, LIM_FUNC, LIM_TO, LIM_UPP, M, BRK, BLANK, BACKSLASH, ALN, FUNC_PLACE)
+
+OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
+
+
+def load(stream):
+	"""
+	Parse XML from ``stream`` and lazily yield one ``oMath2Latex`` per
+	``oMath`` element returned by ``findall`` on the parsed tree.
+	"""
+	document = ET.parse(stream)
+	omath_tag = OMML_NS+'oMath'
+	for node in document.findall(omath_tag):
+		yield oMath2Latex(node)
+
+def load_string(string):
+	"""
+	Parse XML from ``string`` and lazily yield one ``oMath2Latex`` per
+	``oMath`` element returned by ``findall`` on the root element.
+	"""
+	document_root = ET.fromstring(string)
+	omath_tag = OMML_NS+'oMath'
+	for node in document_root.findall(omath_tag):
+		yield oMath2Latex(node)
+
+def escape_latex(strs):
+	"""
+	Return ``strs`` with every character listed in ``CHARS`` prefixed by a
+	backslash, unless the character right before it is already a backslash.
+	Doubled backslashes are collapsed to a single one first.
+	"""
+	collapsed = strs.replace(r'\\','\\')
+	pieces = []
+	previous = None
+	for ch in collapsed:
+		needs_escape = (ch in CHARS) and (previous != BACKSLASH)
+		pieces.append(BACKSLASH+ch if needs_escape else ch)
+		previous = ch
+	return BLANK.join(pieces)
+
+def get_val(key,default=None,store=CHR):
+	"""
+	Look ``key`` up in ``store``.
+
+	Returns ``default`` when ``key`` is None. Otherwise returns the mapped
+	value, falling back to ``key`` itself when it is not in ``store`` or
+	when ``store`` is empty/None.
+	"""
+	if key is None:
+		return default
+	if not store:
+		return key
+	return store.get(key,key)
+
+
+class Tag2Method(object):
+	"""
+	Base class that routes child elements to handler methods.
+
+	Subclasses supply a ``tag2meth`` mapping from a tag name (namespace
+	removed) to a plain function taking ``(self, elm)``.
+	"""
+
+	def call_method(self,elm,stag=None):
+		"""
+		run the handler registered for the tag of elm,
+		return its result or None when no handler is registered
+		"""
+		lookup = self.tag2meth.get
+		if stag is None:
+			stag = elm.tag.replace(OMML_NS,'')
+		handler = lookup(stag)
+		return handler(self,elm) if handler else None
+
+	def process_children_list(self,elm,include=None):
+		"""
+		convert each child of elm that is in the OMML namespace,
+		yield (short tag, converted value, child) for every child that
+		produced a value; include optionally restricts the short tags
+		"""
+		for child in list(elm):
+			full_tag = child.tag
+			if OMML_NS not in full_tag:
+				continue
+			stag = full_tag.replace(OMML_NS,'')
+			if include and (stag not in include):
+				continue
+			converted = self.call_method(child,stag=stag)
+			if converted is None:
+				# no handler (or it produced nothing): give the subclass hook a chance
+				converted = self.process_unknow(child,stag)
+			if converted is not None:
+				yield (stag,converted,child)
+
+	def process_children_dict(self,elm,include=None):
+		"""
+		convert the children of elm,return {short tag: converted value};
+		a later child with the same tag replaces an earlier one
+		"""
+		return {stag: t for stag,t,_e in self.process_children_list(elm,include)}
+
+	def process_children(self,elm,include=None):
+		"""
+		convert the children of elm,return the results joined into one string
+		"""
+		parts = []
+		for _stag,t,_e in self.process_children_list(elm,include):
+			parts.append(str(t) if isinstance(t,Tag2Method) else t)
+		return BLANK.join(parts)
+
+	def process_unknow(self,elm,stag):
+		"""
+		hook for tags without a handler; the base implementation ignores them
+		"""
+		return None
+
+
+class Pr(Tag2Method):
+	"""
+	common properties of element
+
+	The ``val`` attribute of each child whose tag is in ``__val_tags`` is
+	stored and exposed as an attribute of the same name (``pr.chr``,
+	``pr.pos``, ...); a property that was not present reads as None.
+	``text`` holds the output produced by the children (a line break for
+	``brk``).
+	"""
+
+	text = ''
+
+	__val_tags = ('chr','pos','begChr','endChr','type')
+
+	__innerdict= None #can't use the __dict__
+
+	def __init__(self, elm):
+		self.__innerdict={}
+		self.text=self.process_children(elm)
+
+	def __str__(self):
+		return self.text
+
+	def __unicode__(self):
+		# __str__ is already bound; passing self again raised TypeError
+		return self.__str__()
+
+	def __getattr__(self,name):
+		# only called for names not found normally: unknown properties are None
+		store = self.__innerdict
+		return None if store is None else store.get(name,None)
+
+	def do_brk(self,elm):
+		"""
+		record the break and return the latex line break
+		"""
+		self.__innerdict['brk'] = BRK
+		return BRK
+
+	def do_common(self,elm):
+		"""
+		store the val attribute of a known property tag, produce no text
+		"""
+		stag = elm.tag.replace(OMML_NS,'')
+		if stag in self.__val_tags:
+			t = elm.get('{0}val'.format(OMML_NS))
+			self.__innerdict[stag] = t
+		return None
+
+	tag2meth = {
+		'brk':do_brk,
+		'chr':do_common,
+		'pos':do_common,
+		'begChr':do_common,
+		'endChr':do_common,
+		'type':do_common,
+	}
+
+
+class oMath2Latex(Tag2Method):
+	"""
+	Convert oMath element of omml to latex
+
+	The conversion runs in the constructor; the result is available through
+	the ``latex`` property and ``str()``.
+	"""
+	_t_dict = T
+
+	# tags without a handler whose children are simply converted and joined
+	__direct_tags = ('box','sSub','sSup','sSubSup','num','den','deg','e')
+
+	def __init__(self, element):
+		self._latex = self.process_children(element)
+
+	def __str__(self):
+		return self.latex
+
+	def __unicode__(self):
+		# __str__ is already bound; passing self again raised TypeError
+		return self.__str__()
+
+	def process_unknow(self,elm,stag):
+		"""
+		fallback for tags without a handler: pass-through containers are
+		converted via their children, '*Pr' tags become a Pr, others are dropped
+		"""
+		if stag in self.__direct_tags:
+			return self.process_children(elm)
+		elif stag[-2:] == 'Pr':
+			return Pr(elm)
+		else:
+			return None
+
+	@property
+	def latex(self):
+		return self._latex
+
+	def do_acc(self,elm):
+		"""
+		the accent function
+		"""
+		c_dict = self.process_children_dict(elm)
+		latex_s = get_val(c_dict['accPr'].chr,default=CHR_DEFAULT.get('ACC_VAL'),store=CHR)
+		return latex_s.format(c_dict['e'])
+
+	def do_bar(self,elm):
+		"""
+		the bar function
+		"""
+		c_dict = self.process_children_dict(elm)
+		pr = c_dict['barPr']
+		latex_s = get_val(pr.pos,default=POS_DEFAULT.get('BAR_VAL'),store=POS)
+		return pr.text+latex_s.format(c_dict['e'])
+
+	def do_d(self,elm):
+		"""
+		the delimiter object
+		"""
+		c_dict = self.process_children_dict(elm)
+		pr = c_dict['dPr']
+		null = D_DEFAULT.get('null')
+		s_val = get_val(pr.begChr,default=D_DEFAULT.get('left'),store=T)
+		e_val = get_val(pr.endChr,default=D_DEFAULT.get('right'),store=T)
+		# an explicitly empty delimiter becomes the latex null delimiter '.'
+		return pr.text+D.format(left= null if not s_val else escape_latex(s_val),
+					text=c_dict['e'],
+					right= null if not e_val else  escape_latex(e_val))
+
+
+	def do_spre(self,elm):
+		"""
+		the Pre-Sub-Superscript object -- Not support yet
+		"""
+		pass
+
+	def do_sub(self,elm):
+		"""
+		the subscript part
+		"""
+		text = self.process_children(elm)
+		return SUB.format(text)
+
+	def do_sup(self,elm):
+		"""
+		the superscript part
+		"""
+		text = self.process_children(elm)
+		return SUP.format(text)
+
+	def do_f(self,elm):
+		"""
+		the fraction object
+		"""
+		c_dict = self.process_children_dict(elm)
+		pr = c_dict['fPr']
+		latex_s = get_val(pr.type,default=F_DEFAULT,store=F)
+		return pr.text+latex_s.format(num=c_dict.get('num'),den=c_dict.get('den'))
+
+	def do_func(self,elm):
+		"""
+		the Function-Apply object (Examples:sin cos)
+		"""
+		c_dict = self.process_children_dict(elm)
+		func_name = c_dict.get('fName')
+		return func_name.replace(FUNC_PLACE,c_dict.get('e'))
+
+	def do_fname(self,elm):
+		"""
+		the func name
+
+		raises NotImplementedError for a function name that is not in FUNC
+		"""
+		latex_chars = []
+		for stag,t,e in self.process_children_list(elm):
+			if stag == 'r':
+				if FUNC.get(t):
+					latex_chars.append(FUNC[t])
+				else :
+					# NotImplemented is a constant, not an exception class
+					raise NotImplementedError("Not support func %s" % t)
+			else:
+				latex_chars.append(t)
+		t = BLANK.join(latex_chars)
+		return t if FUNC_PLACE in t else t+FUNC_PLACE #do_func will replace this
+
+	def do_groupchr(self,elm):
+		"""
+		the Group-Character object
+		"""
+		c_dict = self.process_children_dict(elm)
+		pr = c_dict['groupChrPr']
+		latex_s = get_val(pr.chr)
+		return pr.text+latex_s.format(c_dict['e'])
+
+	def do_rad(self,elm):
+		"""
+		the radical object
+		"""
+		c_dict = self.process_children_dict(elm)
+		text = c_dict.get('e')
+		deg_text = c_dict.get('deg')
+		if deg_text:
+			return RAD.format(deg=deg_text,text=text)
+		else:
+			return RAD_DEFAULT.format(text=text)
+
+	def do_eqarr(self,elm):
+		"""
+		the Array object
+		"""
+		return ARR.format(text=BRK.join(
+			[t for stag,t,e in self.process_children_list(elm,include=('e',))]))
+
+
+	def do_limlow(self,elm):
+		"""
+		the Lower-Limit object
+
+		raises NotImplementedError when the base is not a key of LIM_FUNC
+		"""
+		t_dict = self.process_children_dict(elm,include=('e','lim'))
+		latex_s = LIM_FUNC.get(t_dict['e'])
+		if not latex_s :
+			# NotImplemented is a constant, not an exception class
+			raise NotImplementedError("Not support lim %s" % t_dict['e'])
+		else:
+			return latex_s.format(lim=t_dict.get('lim'))
+
+	def do_limupp(self,elm):
+		"""
+		the Upper-Limit object
+		"""
+		t_dict = self.process_children_dict(elm,include=('e','lim'))
+		return LIM_UPP.format(lim=t_dict.get('lim'),text=t_dict.get('e'))
+
+	def do_lim(self,elm):
+		"""
+		the lower limit of the limLow object and the upper limit of the limUpp function
+		"""
+		return self.process_children(elm).replace(LIM_TO[0],LIM_TO[1])
+
+	def do_m(self,elm):
+		"""
+		the Matrix object
+		"""
+		rows = []
+		for stag,t,e in self.process_children_list(elm):
+			# only the rows contribute text; mPr and anything else is skipped
+			if stag == 'mr':
+				rows.append(t)
+		return M.format(text=BRK.join(rows))
+
+	def do_mr(self,elm):
+		"""
+		a single row of the matrix m
+		"""
+		return ALN.join(
+			[t for stag,t,e in self.process_children_list(elm,include=('e',))])
+
+	def do_nary(self,elm):
+		"""
+		the n-ary object
+		"""
+		res = []
+		bo = ''
+		for stag,t,e in self.process_children_list(elm):
+			if stag == 'naryPr':
+				# Without a default, a naryPr lacking m:chr made bo None and
+				# the concatenation below raised TypeError.
+				# NOTE(review): assumes the OMML default operator is the
+				# integral (U+222B) when m:chr is omitted -- confirm in ECMA-376.
+				bo = get_val(t.chr,default=CHR_BO.get('\u222b'),store=CHR_BO)
+			else :
+				res.append(t)
+		return bo+BLANK.join(res)
+
+	def do_r(self,elm):
+		"""
+		Get text from 'r' element,And try convert them to latex symbols
+		@todo text style support , (sty)
+		@todo \\text (latex pure text support)
+		"""
+		# findtext returns None when the run has no 't' child; treat as empty
+		text = elm.findtext('./{0}t'.format(OMML_NS)) or ''
+		_str = []
+		for s in text:
+			_str.append(self._t_dict.get(s,s))
+		return escape_latex(BLANK.join(_str))
+
+	tag2meth={
+		'acc' : do_acc,
+		'r' : do_r,
+		'bar' : do_bar,
+		'sub' : do_sub,
+		'sup' : do_sup,
+		'f'   : do_f,
+		'func': do_func,
+		'fName' : do_fname,
+		'groupChr' : do_groupchr,
+		'd' : do_d,
+		'rad' : do_rad,
+		'eqArr' : do_eqarr,
+		'limLow' : do_limlow,
+		'limUpp' : do_limupp,
+		'lim' : do_lim,
+		'm' : do_m,
+		'mr' : do_mr,
+		'nary' : do_nary,
+	}
--- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@ -0,0 +1,156 @@
+import zipfile
+from io import BytesIO
+from typing import BinaryIO
+from xml.etree import ElementTree as ET
+
+from bs4 import BeautifulSoup, Tag
+
+from .math.omml import OMML_NS, oMath2Latex
+
+# Skeleton of a ``w:document`` root that declares the WordprocessingML /
+# Office namespace prefixes (including ``m`` for math). ``{0}`` is filled
+# with a serialized fragment so the fragment can be parsed as a standalone
+# XML document.
+MATH_ROOT_TEMPLATE = "".join(
+    (
+        "<w:document ",
+        'xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" ',
+        'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" ',
+        'xmlns:o="urn:schemas-microsoft-com:office:office" ',
+        'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" ',
+        'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" ',
+        'xmlns:v="urn:schemas-microsoft-com:vml" ',
+        'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" ',
+        'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" ',
+        'xmlns:w10="urn:schemas-microsoft-com:office:word" ',
+        'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" ',
+        'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" ',
+        'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" ',
+        'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" ',
+        'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" ',
+        'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">',
+        "{0}</w:document>",
+    )
+)
+
+
+def _convert_omath_to_latex(tag: Tag) -> str:
+    """
+    Render one OMML "oMath" tag as a LaTeX string.
+
+    The tag is serialized, wrapped in MATH_ROOT_TEMPLATE so it parses as a
+    standalone document, re-parsed with ElementTree, and the "oMath" child of
+    that root is handed to oMath2Latex.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object representing the OMML element.
+
+    Returns:
+        str: The LaTeX representation of the OMML element.
+    """
+    wrapped_xml = MATH_ROOT_TEMPLATE.format(str(tag))
+    document_root = ET.fromstring(wrapped_xml)
+    omath_element = document_root.find(OMML_NS + "oMath")
+    return oMath2Latex(omath_element).latex
+
+
+def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
+    """
+    Build the "w:r" run that stands in for an OMML "oMath" element.
+
+    The run holds a single "w:t" text node with the equation's LaTeX wrapped
+    in "$...$", or in "$$...$$" when block is True.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
+        block (bool, optional): Wrap the LaTeX in double dollar signs (block mode). Defaults to False.
+
+    Returns:
+        Tag: A BeautifulSoup Tag object representing the replacement element.
+    """
+    delimiter = "$$" if block else "$"
+    text_tag = Tag(name="w:t")
+    text_tag.string = f"{delimiter}{_convert_omath_to_latex(tag)}{delimiter}"
+    run_tag = Tag(name="w:r")
+    run_tag.append(text_tag)
+    return run_tag
+
+
+def _replace_equations(tag: Tag):
+    """
+    Swap an OMML element in its tree for a LaTeX text equivalent.
+
+    An "oMath" tag becomes one inline-equation run. An "oMathPara" tag becomes
+    a "w:p" paragraph holding one block-equation run per "oMath" it contains.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object, either "oMathPara" or "oMath".
+
+    Raises:
+        ValueError: If the tag is not supported.
+    """
+    if tag.name == "oMath":
+        tag.replace_with(_get_omath_tag_replacement(tag, block=False))
+        return
+    if tag.name != "oMathPara":
+        raise ValueError(f"Not supported tag: {tag.name}")
+    paragraph = Tag(name="w:p")
+    for omath in tag.find_all("oMath"):
+        paragraph.append(_get_omath_tag_replacement(omath, block=True))
+    tag.replace_with(paragraph)
+
+
+def _pre_process_math(content: bytes) -> bytes:
+    """
+    Replace every OMML equation in one XML part of a DOCX with LaTeX text.
+
+    Args:
+        content (bytes): The XML content of the DOCX part as bytes.
+
+    Returns:
+        bytes: The re-serialized XML with OMML elements replaced by their LaTeX equivalents.
+    """
+    soup = BeautifulSoup(content.decode(), features="xml")
+    # "oMathPara" first: the second search then only sees the "oMath"
+    # elements that are still in the tree.
+    for tag_name in ("oMathPara", "oMath"):
+        for tag in soup.find_all(tag_name):
+            _replace_equations(tag)
+    return str(soup).encode()
+
+
+def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
+    """
+    Pre-processes a DOCX file with provided steps.
+
+    The process works by unzipping the DOCX file in memory, transforming specific XML files
+    (such as converting OMML elements to LaTeX), and then zipping everything back into a
+    DOCX file without writing to disk. A part whose transformation fails is copied through
+    unchanged, so a bad equation never prevents the rest of the document from converting.
+
+    Args:
+        input_docx (BinaryIO): A binary input stream representing the DOCX file.
+
+    Returns:
+        BinaryIO: A binary output stream representing the processed DOCX file,
+        positioned at its start.
+    """
+    output_docx = BytesIO()
+    # The files that need to be pre-processed from .docx
+    pre_process_enable_files = [
+        "word/document.xml",
+        "word/footnotes.xml",
+        "word/endnotes.xml",
+    ]
+    with zipfile.ZipFile(input_docx, mode="r") as zip_input:
+        files = {name: zip_input.read(name) for name in zip_input.namelist()}
+        with zipfile.ZipFile(output_docx, mode="w") as zip_output:
+            zip_output.comment = zip_input.comment
+            for name, content in files.items():
+                if name in pre_process_enable_files:
+                    try:
+                        # Pre-process the content
+                        updated_content = _pre_process_math(content)
+                        # In the future, if there are more pre-processing steps, they can be added here
+                    except Exception:
+                        # Best effort: keep the original part when it cannot be processed.
+                        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
+                        updated_content = content
+                    zip_output.writestr(name, updated_content)
+                else:
+                    zip_output.writestr(name, content)
+    output_docx.seek(0)
+    return output_docx
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -3,6 +3,7 @@ import sys
 from typing import BinaryIO, Any

 from ._html_converter import HtmlConverter
+from ..converter_utils.docx.pre_process import pre_process_docx
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@ -72,6 +73,7 @@ class DocxConverter(HtmlConverter):
            )

        style_map = kwargs.get("style_map", None)
+        pre_process_stream = pre_process_docx(file_stream)
        return self._html_converter.convert_string(
-            mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
+            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, **kwargs
        )