feat: math equation rendering in .docx files

This commit is contained in:
Sathindu Ganhala Arachchige 2025-03-27 14:20:21 -04:00
parent 2ffe6ea591
commit 5f2af03aeb
8 changed files with 797 additions and 2 deletions

View file

@ -47,7 +47,7 @@ all = [
"azure-identity"
]
pptx = ["python-pptx"]
docx = ["mammoth"]
docx = ["mammoth", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]

View file

@ -0,0 +1,275 @@
# -*- coding: utf-8 -*-
"""
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 25/03/2025
"""
from __future__ import unicode_literals
# Characters that escape_latex() prefixes with a backslash.
CHARS = tuple('{}_^#&$%~')
# Empty string; used as the separator when joining LaTeX fragments.
BLANK = str()
# A single backslash character.
BACKSLASH = chr(0x5C)
# Separator placed between the cells of a matrix row.
ALN = chr(0x26)
# Maps an accent / grouping character (as stored in the OMML ``chr`` value)
# to a ``str.format`` template. ``{0}`` receives the LaTeX of the base
# expression; the doubled braces render as literal ``{`` and ``}``.
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    '\u0300': '\\grave{{{0}}}',
    '\u0301': '\\acute{{{0}}}',
    '\u0302': '\\hat{{{0}}}',
    '\u0303': '\\tilde{{{0}}}',
    '\u0304': '\\bar{{{0}}}',
    '\u0305': '\\overbar{{{0}}}',
    '\u0306': '\\breve{{{0}}}',
    '\u0307': '\\dot{{{0}}}',
    '\u0308': '\\ddot{{{0}}}',
    '\u0309': '\\ovhook{{{0}}}',
    # The two templates below used to end in a stray '}' which made
    # str.format() raise "Single '}' encountered in format string".
    '\u030a': '\\ocirc{{{0}}}',
    '\u030c': '\\check{{{0}}}',
    '\u0310': '\\candra{{{0}}}',
    '\u0312': '\\oturnedcomma{{{0}}}',
    '\u0315': '\\ocommatopright{{{0}}}',
    '\u031a': '\\droang{{{0}}}',
    '\u0338': '\\not{{{0}}}',
    '\u20d0': '\\leftharpoonaccent{{{0}}}',
    '\u20d1': '\\rightharpoonaccent{{{0}}}',
    '\u20d2': '\\vertoverlay{{{0}}}',
    '\u20d6': '\\overleftarrow{{{0}}}',
    '\u20d7': '\\vec{{{0}}}',
    '\u20db': '\\dddot{{{0}}}',
    '\u20dc': '\\ddddot{{{0}}}',
    '\u20e1': '\\overleftrightarrow{{{0}}}',
    '\u20e7': '\\annuity{{{0}}}',
    '\u20e9': '\\widebridgeabove{{{0}}}',
    '\u20f0': '\\asteraccent{{{0}}}',
    # Bottom accents
    '\u0330': '\\wideutilde{{{0}}}',
    '\u0331': '\\underbar{{{0}}}',
    '\u20e8': '\\threeunderdot{{{0}}}',
    '\u20ec': '\\underrightharpoondown{{{0}}}',
    '\u20ed': '\\underleftharpoondown{{{0}}}',
    '\u20ee': '\\underleftarrow{{{0}}}',  # was misspelled "underledtarrow"
    '\u20ef': '\\underrightarrow{{{0}}}',
    # Over | group
    '\u23b4': '\\overbracket{{{0}}}',
    '\u23dc': '\\overparen{{{0}}}',
    '\u23de': '\\overbrace{{{0}}}',
    # Under | group
    '\u23b5': '\\underbracket{{{0}}}',
    '\u23dd': '\\underparen{{{0}}}',
    '\u23df': '\\underbrace{{{0}}}',
}
# Big (n-ary) operators: operator character -> LaTeX command.
CHR_BO = dict(
    [
        ('\u2140', '\\Bbbsum'),
        ('\u220f', '\\prod'),
        ('\u2210', '\\coprod'),
        ('\u2211', '\\sum'),
        ('\u222b', '\\int'),
        ('\u22c0', '\\bigwedge'),
        ('\u22c1', '\\bigvee'),
        ('\u22c2', '\\bigcap'),
        ('\u22c3', '\\bigcup'),
        ('\u2a00', '\\bigodot'),
        ('\u2a01', '\\bigoplus'),
        ('\u2a02', '\\bigotimes'),
    ]
)
# Per-character translation table applied to the text of a math run:
# Unicode character -> LaTeX replacement. Commands carry a trailing space so
# that they do not run into whatever character follows them.
T = {
    # Greek letters (mathematical italic small, U+1D6FC..U+1D71B)
    '\U0001d6fc': '\\alpha ',
    '\U0001d6fd': '\\beta ',
    '\U0001d6fe': '\\gamma ',
    '\U0001d6ff': '\\delta ',  # was wrongly mapped to \theta
    '\U0001d700': '\\epsilon ',
    '\U0001d701': '\\zeta ',
    '\U0001d702': '\\eta ',
    '\U0001d703': '\\theta ',
    '\U0001d704': '\\iota ',
    '\U0001d705': '\\kappa ',
    '\U0001d706': '\\lambda ',
    '\U0001d707': '\\mu ',  # was the truncated command "\m"
    '\U0001d708': '\\nu ',  # was the truncated command "\n"
    '\U0001d709': '\\xi ',
    '\U0001d70a': '\\omicron ',
    '\U0001d70b': '\\pi ',
    '\U0001d70c': '\\rho ',
    '\U0001d70d': '\\varsigma ',
    '\U0001d70e': '\\sigma ',
    '\U0001d70f': '\\tau ',  # was the truncated command "\ta"
    '\U0001d710': '\\upsilon ',
    '\U0001d711': '\\phi ',
    '\U0001d712': '\\chi ',
    '\U0001d713': '\\psi ',
    '\U0001d714': '\\omega ',
    '\U0001d715': '\\partial ',
    '\U0001d716': '\\varepsilon ',
    '\U0001d717': '\\vartheta ',
    '\U0001d718': '\\varkappa ',
    '\U0001d719': '\\varphi ',
    '\U0001d71a': '\\varrho ',
    '\U0001d71b': '\\varpi ',
    # Relation symbols
    '\u2190': '\\leftarrow ',
    '\u2191': '\\uparrow ',
    '\u2192': '\\rightarrow ',
    '\u2193': '\\downarrow ',  # was the non-existent command "\downright"
    '\u2194': '\\leftrightarrow ',
    '\u2195': '\\updownarrow ',
    '\u2196': '\\nwarrow ',
    '\u2197': '\\nearrow ',
    '\u2198': '\\searrow ',
    '\u2199': '\\swarrow ',
    '\u22ee': '\\vdots ',
    '\u22ef': '\\cdots ',
    '\u22f0': '\\adots ',
    '\u22f1': '\\ddots ',
    '\u2260': '\\ne ',
    '\u2264': '\\leq ',
    '\u2265': '\\geq ',
    '\u2266': '\\leqq ',
    '\u2267': '\\geqq ',
    '\u2268': '\\lneqq ',
    '\u2269': '\\gneqq ',
    '\u226a': '\\ll ',
    '\u226b': '\\gg ',
    '\u2208': '\\in ',
    '\u2209': '\\notin ',
    '\u220b': '\\ni ',
    '\u220c': '\\nni ',
    # Ordinary symbols
    '\u221e': '\\infty ',
    # Binary relations
    '\u00b1': '\\pm ',
    '\u2213': '\\mp ',
    # Italic, Latin, uppercase
    '\U0001d434': 'A',
    '\U0001d435': 'B',
    '\U0001d436': 'C',
    '\U0001d437': 'D',
    '\U0001d438': 'E',
    '\U0001d439': 'F',
    '\U0001d43a': 'G',
    '\U0001d43b': 'H',
    '\U0001d43c': 'I',
    '\U0001d43d': 'J',
    '\U0001d43e': 'K',
    '\U0001d43f': 'L',
    '\U0001d440': 'M',
    '\U0001d441': 'N',
    '\U0001d442': 'O',
    '\U0001d443': 'P',
    '\U0001d444': 'Q',
    '\U0001d445': 'R',
    '\U0001d446': 'S',
    '\U0001d447': 'T',
    '\U0001d448': 'U',
    '\U0001d449': 'V',
    '\U0001d44a': 'W',
    '\U0001d44b': 'X',
    '\U0001d44c': 'Y',
    '\U0001d44d': 'Z',
    # Italic, Latin, lowercase
    '\U0001d44e': 'a',
    '\U0001d44f': 'b',
    '\U0001d450': 'c',
    '\U0001d451': 'd',
    '\U0001d452': 'e',
    '\U0001d453': 'f',
    '\U0001d454': 'g',
    # U+1D455 is unassigned: the italic small h lives at U+210E
    # (PLANCK CONSTANT), so map that code point instead.
    '\u210e': 'h',
    '\U0001d456': 'i',
    '\U0001d457': 'j',
    '\U0001d458': 'k',
    '\U0001d459': 'l',
    '\U0001d45a': 'm',
    '\U0001d45b': 'n',
    '\U0001d45c': 'o',
    '\U0001d45d': 'p',
    '\U0001d45e': 'q',
    '\U0001d45f': 'r',
    '\U0001d460': 's',
    '\U0001d461': 't',
    '\U0001d462': 'u',
    '\U0001d463': 'v',
    '\U0001d464': 'w',
    '\U0001d465': 'x',
    '\U0001d466': 'y',
    '\U0001d467': 'z',
}
# Supported function names -> LaTeX template. The ``{fe}`` marker is a
# literal placeholder (see FUNC_PLACE) that is substituted with the function
# argument via str.replace(), not via str.format().
FUNC = {
    'sin': '\\sin({fe})',
    'cos': '\\cos({fe})',
    'tan': '\\tan({fe})',
    'cot': '\\cot({fe})',  # previously missing although coth/arccot existed
    'arcsin': '\\arcsin({fe})',
    'arccos': '\\arccos({fe})',
    'arctan': '\\arctan({fe})',
    'arccot': '\\arccot({fe})',
    'sinh': '\\sinh({fe})',
    'cosh': '\\cosh({fe})',
    'tanh': '\\tanh({fe})',
    'coth': '\\coth({fe})',
    'sec': '\\sec({fe})',
    'csc': '\\csc({fe})',
}
# Placeholder inside the FUNC templates that stands for the function argument.
FUNC_PLACE = '{fe}'
# Row separator: the two-character LaTeX line break (two backslashes).
BRK = '\\\\'

# Accent template used when an accent carries no explicit character.
CHR_DEFAULT = {
    'ACC_VAL': '\\hat{{{0}}}',
}

# Bar templates keyed by the bar position value.
POS = {
    'top': '\\overline{{{0}}}',  # not sure
    'bot': '\\underline{{{0}}}',
}

# Bar template used when no position is given.
POS_DEFAULT = {
    'BAR_VAL': '\\overline{{{0}}}',
}

# Subscript / superscript templates.
SUB = '_{{{0}}}'
SUP = '^{{{0}}}'

# Fraction templates keyed by the fraction type value.
F = {
    'bar': '\\frac{{{num}}}{{{den}}}',
    'skw': r'^{{{num}}}/_{{{den}}}',
    'noBar': '\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}',
    'lin': '{{{num}}}/{{{den}}}',
}

# Fraction template used when no type is given.
F_DEFAULT = '\\frac{{{num}}}{{{den}}}'

# Delimiter template and its default opening / closing / "no delimiter" values.
D = '\\left{left}{text}\\right{right}'
D_DEFAULT = {
    'left': '(',
    'right': ')',
    'null': '.',
}

# Radical templates, with and without an explicit degree.
RAD = '\\sqrt[{deg}]{{{text}}}'
RAD_DEFAULT = '\\sqrt{{{text}}}'

# Equation-array template. The backslash before "end" is now written as an
# explicit "\\": the former "\e" was an invalid escape sequence that only
# produced the intended text by accident and triggers a SyntaxWarning.
ARR = '\\begin{{array}}{{c}}{text}\\end{{array}}'

# Templates for functions that take a lower limit.
LIM_FUNC = {
    'lim': '\\lim_{{{lim}}}',
    'max': '\\max_{{{lim}}}',
    'min': '\\min_{{{lim}}}',
}

# (search, replacement) pair applied to the text of a limit expression.
LIM_TO = ('\\rightarrow', '\\to')

# Upper-limit template.
LIM_UPP = '\\overset{{{lim}}}{{{text}}}'

# Matrix template (same "\\end" escape fix as ARR).
M = '\\begin{{matrix}}{text}\\end{{matrix}}'

View file

@ -0,0 +1,362 @@
# -*- coding: utf-8 -*-
"""
Office Math Markup Language (OMML)
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
On 25/03/2025
"""
import xml.etree.ElementTree as ET
from experiment.math.latex_dict import (CHARS, CHR, CHR_BO, CHR_DEFAULT, POS, POS_DEFAULT
, SUB, SUP, F, F_DEFAULT, T, FUNC, D, D_DEFAULT, RAD, RAD_DEFAULT, ARR
, LIM_FUNC, LIM_TO, LIM_UPP, M, BRK, BLANK, BACKSLASH, ALN, FUNC_PLACE)
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
def load(stream):
    """
    Parse XML from ``stream`` and lazily yield one oMath2Latex object for
    each ``oMath`` element returned by ``findall`` on the parsed tree.

    This is a generator: nothing is parsed until iteration starts.
    """
    document = ET.parse(stream)
    omath_tag = OMML_NS + 'oMath'
    yield from map(oMath2Latex, document.findall(omath_tag))
def load_string(string):
    """
    Parse XML from the text ``string`` and lazily yield one oMath2Latex
    object for each ``oMath`` element returned by ``findall`` on the root.

    This is a generator: nothing is parsed until iteration starts.
    """
    document_root = ET.fromstring(string)
    omath_tag = OMML_NS + 'oMath'
    yield from map(oMath2Latex, document_root.findall(omath_tag))
def escape_latex(strs):
    """
    Return ``strs`` with every character listed in CHARS prefixed by a
    backslash, unless the character directly before it is already a backslash.

    Doubled backslashes in the input are collapsed to a single one first.
    """
    collapsed = strs.replace(r'\\', '\\')
    pieces = []
    previous = None
    for ch in collapsed:
        needs_escape = ch in CHARS and previous != BACKSLASH
        pieces.append(BACKSLASH + ch if needs_escape else ch)
        previous = ch
    return BLANK.join(pieces)
def get_val(key, default=None, store=CHR):
    """
    Look ``key`` up in ``store``.

    Returns ``default`` when ``key`` is None. Otherwise returns the mapped
    value, or ``key`` itself when it is not in ``store`` or ``store`` is
    empty / None.
    """
    if key is None:
        return default
    if not store:
        return key
    return store.get(key, key)
class Tag2Method(object):
    """
    Base class that dispatches XML elements to handler methods by tag name.

    Subclasses provide a ``tag2meth`` mapping of short tag name (namespace
    removed) to an unbound handler taking ``(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """
        Run the handler registered for ``elm`` and return its result, or
        None when no handler is registered. ``stag`` is the short tag name;
        it is derived from ``elm.tag`` when omitted.
        """
        lookup = self.tag2meth.get
        if stag is None:
            stag = elm.tag.replace(OMML_NS, '')
        handler = lookup(stag)
        if not handler:
            return None
        return handler(self, elm)

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(short_tag, result, child)`` for each child in the OMML
        namespace, optionally restricted to the tags in ``include``.
        Children whose result is None are skipped.
        """
        for child in list(elm):
            if OMML_NS not in child.tag:
                continue
            short_tag = child.tag.replace(OMML_NS, '')
            if include and short_tag not in include:
                continue
            result = self.call_method(child, stag=short_tag)
            if result is None:
                # No handler (or it returned None): try the fallback hook.
                result = self.process_unknow(child, short_tag)
            if result is not None:
                yield (short_tag, result, child)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Keys are short tag names; a later child with the same tag replaces
        an earlier one.
        """
        return {
            short_tag: result
            for short_tag, result, _child in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string

        Tag2Method results are converted with ``str()`` before joining.
        """
        fragments = []
        for _short_tag, result, _child in self.process_children_list(elm, include):
            if isinstance(result, Tag2Method):
                result = str(result)
            fragments.append(result)
        return BLANK.join(fragments)

    def process_unknow(self, elm, stag):
        """Fallback for tags without a handler; the base version yields nothing."""
        return None
class Pr(Tag2Method):
    """
    Common properties of an element.

    Processing the children collects the ``val`` attribute of the
    ``chr``, ``pos``, ``begChr``, ``endChr`` and ``type`` children; each is
    then readable as an attribute of the instance (None when absent).
    ``text`` holds the LaTeX produced by the children (only ``brk`` emits any).
    """

    text = ''
    __val_tags = ('chr', 'pos', 'begChr', 'endChr', 'type')
    # Separate store for the collected values (can't use the __dict__).
    __innerdict = None

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Previously called self.__str__(self), which raised TypeError
        # because the bound method received an extra argument.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached for names not found normally: unknown names resolve
        # to the collected property value, or None.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record a line break and emit the break sequence."""
        self.__innerdict['brk'] = BRK
        return BRK

    def do_common(self, elm):
        """Store the ``val`` attribute of a known property tag; emits no text."""
        stag = elm.tag.replace(OMML_NS, '')
        if stag in self.__val_tags:
            t = elm.get('{0}val'.format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        'brk': do_brk,
        'chr': do_common,
        'pos': do_common,
        'begChr': do_common,
        'endChr': do_common,
        'type': do_common,
    }
class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex

    The conversion runs in the constructor; the result is available through
    the ``latex`` property and ``str()``.
    """
    _t_dict = T
    # Container tags whose LaTeX is simply the concatenation of their children.
    __direct_tags = ('box', 'sSub', 'sSup', 'sSubSup', 'num', 'den', 'deg', 'e')

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        return self.latex

    def __unicode__(self):
        # Previously called self.__str__(self), which raised TypeError.
        return self.__str__()

    def process_unknow(self, elm, stag):
        """
        Fallback for tags without a handler: pass-through containers are
        flattened, ``*Pr`` tags become Pr objects, anything else is dropped.
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        elif stag[-2:] == 'Pr':
            return Pr(elm)
        else:
            return None

    @property
    def latex(self):
        """The LaTeX string produced for the element."""
        return self._latex

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        # The properties child may be absent; fall back to the default accent.
        pr = c_dict.get('accPr')
        chr_val = pr.chr if pr is not None else None
        latex_s = get_val(chr_val, default=CHR_DEFAULT.get('ACC_VAL'), store=CHR)
        return latex_s.format(c_dict['e'])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get('barPr')
        pos, prefix = (pr.pos, pr.text) if pr is not None else (None, BLANK)
        latex_s = get_val(pos, default=POS_DEFAULT.get('BAR_VAL'), store=POS)
        return prefix + latex_s.format(c_dict['e'])

    def do_d(self, elm):
        """
        the delimiter object

        Missing delimiter characters default to parentheses; an explicitly
        empty one becomes the LaTeX null delimiter.
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get('dPr')
        if pr is None:
            beg_chr, end_chr, prefix = None, None, BLANK
        else:
            beg_chr, end_chr, prefix = pr.begChr, pr.endChr, pr.text
        null = D_DEFAULT.get('null')
        s_val = get_val(beg_chr, default=D_DEFAULT.get('left'), store=T)
        e_val = get_val(end_chr, default=D_DEFAULT.get('right'), store=T)
        return prefix + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict['e'],
            right=null if not e_val else escape_latex(e_val),
        )

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        pass

    def do_sub(self, elm):
        """the subscript part of a script object"""
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        """the superscript part of a script object"""
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get('fPr')
        f_type, prefix = (pr.type, pr.text) if pr is not None else (None, BLANK)
        latex_s = get_val(f_type, default=F_DEFAULT, store=F)
        return prefix + latex_s.format(num=c_dict.get('num'), den=c_dict.get('den'))

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get('fName')
        return func_name.replace(FUNC_PLACE, c_dict.get('e'))

    def do_fname(self, elm):
        """
        the func name

        Raises NotImplementedError for a function name that is not in FUNC.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == 'r':
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    # NotImplemented is a constant, not an exception class;
                    # "raise NotImplemented(...)" failed with a TypeError.
                    raise NotImplementedError("Not support func %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get('groupChrPr')
        chr_val, prefix = (pr.chr, pr.text) if pr is not None else (None, BLANK)
        # NOTE(review): without an explicit character this falls back to the
        # under-brace (U+23DF), believed to be the OMML default group
        # character -- confirm against ECMA-376. Previously a missing value
        # crashed on None.format().
        latex_s = get_val(chr_val, default=CHR.get('\u23df'), store=CHR)
        return prefix + latex_s.format(c_dict['e'])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get('e')
        deg_text = c_dict.get('deg')
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        else:
            return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        return ARR.format(text=BRK.join(
            [t for stag, t, e in self.process_children_list(elm, include=('e',))]))

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises NotImplementedError when the base is not one of LIM_FUNC.
        """
        t_dict = self.process_children_dict(elm, include=('e', 'lim'))
        latex_s = LIM_FUNC.get(t_dict['e'])
        if not latex_s:
            raise NotImplementedError("Not support lim %s" % t_dict['e'])
        else:
            return latex_s.format(lim=t_dict.get('lim'))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=('e', 'lim'))
        return LIM_UPP.format(lim=t_dict.get('lim'), text=t_dict.get('e'))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        # Only rows contribute; the matrix properties are ignored. (The old
        # code compared the tag with "is 'mPr'", an identity test on a
        # literal, which is unreliable and warned about by the compiler.)
        rows = [t for stag, t, e in self.process_children_list(elm) if stag == 'mr']
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=('e',))])

    def do_nary(self, elm):
        """
        the n-ary object
        """
        # NOTE(review): the integral is used when no operator character is
        # given, believed to be the OMML default -- confirm against ECMA-376.
        # Previously a naryPr without chr crashed on "None + str".
        default_bo = CHR_BO.get('\u222b', BLANK)
        bo = default_bo
        res = []
        for stag, t, e in self.process_children_list(elm):
            if stag == 'naryPr':
                bo = get_val(t.chr, default=default_bo, store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo \\text (latex pure text support)
        """
        # findtext() returns None when the run has no text child; treat that
        # as empty text instead of failing while iterating over None.
        text = elm.findtext('./{0}t'.format(OMML_NS)) or BLANK
        return escape_latex(BLANK.join(self._t_dict.get(s, s) for s in text))

    tag2meth = {
        'acc': do_acc,
        'r': do_r,
        'bar': do_bar,
        'sub': do_sub,
        'sup': do_sup,
        'f': do_f,
        'func': do_func,
        'fName': do_fname,
        'groupChr': do_groupchr,
        'd': do_d,
        'rad': do_rad,
        'eqArr': do_eqarr,
        'limLow': do_limlow,
        'limUpp': do_limupp,
        'lim': do_lim,
        'm': do_m,
        'mr': do_mr,
        'nary': do_nary,
    }

View file

@ -0,0 +1,156 @@
import zipfile
from io import BytesIO
from typing import BinaryIO
from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup, Tag
from .math.omml import OMML_NS, oMath2Latex
# Wrapper document used to parse an isolated math fragment: it declares the
# namespace prefixes and leaves a ``{0}`` slot for the fragment's XML.
MATH_ROOT_TEMPLATE = (
    "<w:document "
    'xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" '
    'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" '
    'xmlns:o="urn:schemas-microsoft-com:office:office" '
    'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" '
    'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" '
    'xmlns:v="urn:schemas-microsoft-com:vml" '
    'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" '
    'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" '
    'xmlns:w10="urn:schemas-microsoft-com:office:word" '
    'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" '
    'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" '
    'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" '
    'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" '
    'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" '
    'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">'
    "{0}</w:document>"
)
def _convert_omath_to_latex(tag: Tag) -> str:
    """
    Converts an OMML (Office Math Markup Language) tag to LaTeX format.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the OMML element.

    Returns:
        str: The LaTeX representation of the OMML element.
    """
    # Embed the serialized tag in the wrapper document, which declares the
    # namespace prefixes, so it can be parsed on its own.
    wrapped_xml = MATH_ROOT_TEMPLATE.format(str(tag))
    # Locate the 'oMath' element in the parsed wrapper and convert it.
    omath_element = ET.fromstring(wrapped_xml).find(OMML_NS + "oMath")
    return oMath2Latex(omath_element).latex
def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
    """
    Creates a replacement tag for an OMML (Office Math Markup Language) element.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
        block (bool, optional): If True, the LaTeX will be wrapped in double
            dollar signs for block mode. Defaults to False.

    Returns:
        Tag: A "w:r" Tag containing a single "w:t" Tag whose text is the
            dollar-delimited LaTeX.
    """
    delimiter = "$$" if block else "$"
    text_tag = Tag(name="w:t")
    text_tag.string = f"{delimiter}{_convert_omath_to_latex(tag)}{delimiter}"
    run_tag = Tag(name="w:r")
    run_tag.append(text_tag)
    return run_tag
def _replace_equations(tag: Tag):
    """
    Replaces OMML (Office Math Markup Language) elements with their LaTeX equivalents.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the OMML element.
            Could be either "oMathPara" or "oMath".

    Raises:
        ValueError: If the tag is not supported.
    """
    if tag.name == "oMath":
        # A lone 'oMath' becomes an inline equation.
        tag.replace_with(_get_omath_tag_replacement(tag, block=False))
        return
    if tag.name != "oMathPara":
        raise ValueError(f"Not supported tag: {tag.name}")
    # An 'oMathPara' becomes a new paragraph holding every contained 'oMath'
    # as a block equation.
    paragraph = Tag(name="w:p")
    for omath in tag.find_all("oMath"):
        paragraph.append(_get_omath_tag_replacement(omath, block=True))
    tag.replace_with(paragraph)
def _pre_process_math(content: bytes) -> bytes:
    """
    Pre-processes the math content in a DOCX -> XML file by converting OMML
    (Office Math Markup Language) elements to LaTeX.
    This preprocessed content can be directly replaced in the DOCX file -> XMLs.

    Args:
        content (bytes): The XML content of the DOCX file as bytes.

    Returns:
        bytes: The processed content with OMML elements replaced by their
            LaTeX equivalents, encoded as bytes.
    """
    soup = BeautifulSoup(content.decode(), features="xml")
    # Math paragraphs first; the second pass then only sees the 'oMath'
    # elements that are still in the document.
    for tag_name in ("oMathPara", "oMath"):
        for math_tag in soup.find_all(tag_name):
            _replace_equations(math_tag)
    return str(soup).encode()
def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
    """
    Pre-processes a DOCX file with provided steps.

    The process works by unzipping the DOCX file in memory, transforming specific XML files
    (such as converting OMML elements to LaTeX), and then zipping everything back into a
    DOCX file without writing to disk.

    Pre-processing is best effort: if a part cannot be transformed, its
    original content is written unchanged.

    Args:
        input_docx (BinaryIO): A binary input stream representing the DOCX file.

    Returns:
        BinaryIO: A binary output stream representing the processed DOCX file,
            positioned at its start.
    """
    output_docx = BytesIO()
    # The files that need to be pre-processed from .docx
    pre_process_enable_files = [
        "word/document.xml",
        "word/footnotes.xml",
        "word/endnotes.xml",
    ]
    with zipfile.ZipFile(input_docx, mode="r") as zip_input:
        files = {name: zip_input.read(name) for name in zip_input.namelist()}
        with zipfile.ZipFile(output_docx, mode="w") as zip_output:
            zip_output.comment = zip_input.comment
            for name, content in files.items():
                updated_content = content
                if name in pre_process_enable_files:
                    try:
                        # Pre-process the content
                        # In the future, if there are more pre-processing steps, they can be added here
                        updated_content = _pre_process_math(content)
                    except Exception:
                        # Keep the original part on any processing error.
                        # "except Exception" (not a bare except) lets
                        # KeyboardInterrupt / SystemExit propagate.
                        updated_content = content
                # Written once, outside the try, so a failure can never
                # leave two entries with the same name in the archive.
                zip_output.writestr(name, updated_content)
    output_docx.seek(0)
    return output_docx

View file

@ -3,6 +3,7 @@ import sys
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@ -72,6 +73,7 @@ class DocxConverter(HtmlConverter):
)
style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, **kwargs
)