feat: math equation rendering in .docx files
This commit is contained in:
parent
2ffe6ea591
commit
5f2af03aeb
8 changed files with 797 additions and 2 deletions
|
|
@ -47,7 +47,7 @@ all = [
|
||||||
"azure-identity"
|
"azure-identity"
|
||||||
]
|
]
|
||||||
pptx = ["python-pptx"]
|
pptx = ["python-pptx"]
|
||||||
docx = ["mammoth"]
|
docx = ["mammoth", "lxml"]
|
||||||
xlsx = ["pandas", "openpyxl"]
|
xlsx = ["pandas", "openpyxl"]
|
||||||
xls = ["pandas", "xlrd"]
|
xls = ["pandas", "xlrd"]
|
||||||
pdf = ["pdfminer.six"]
|
pdf = ["pdfminer.six"]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,275 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
||||||
|
On 25/03/2025
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
# Characters that get a backslash prefix when escape_latex() emits them as
# literal text.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

# Empty string; used as the separator when joining LaTeX fragments.
BLANK = ""

# A single backslash character.
BACKSLASH = "\\"

# Separator placed between the cells of one matrix row.
ALN = "&"
|
||||||
|
|
||||||
|
# Maps an accent / group character (as found in an OMML ``chr`` property) to a
# ``str.format`` template with one positional slot ``{0}`` for the base
# expression.  Doubled braces are literal braces, so every template must end
# in exactly ``{{{0}}}`` -- an extra ``}`` makes ``.format`` raise ValueError.
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    '\u0300': '\\grave{{{0}}}',
    '\u0301': '\\acute{{{0}}}',
    '\u0302': '\\hat{{{0}}}',
    '\u0303': '\\tilde{{{0}}}',
    '\u0304': '\\bar{{{0}}}',
    '\u0305': '\\overbar{{{0}}}',
    '\u0306': '\\breve{{{0}}}',
    '\u0307': '\\dot{{{0}}}',
    '\u0308': '\\ddot{{{0}}}',
    '\u0309': '\\ovhook{{{0}}}',
    '\u030a': '\\ocirc{{{0}}}',
    '\u030c': '\\check{{{0}}}',
    '\u0310': '\\candra{{{0}}}',
    '\u0312': '\\oturnedcomma{{{0}}}',
    '\u0315': '\\ocommatopright{{{0}}}',
    '\u031a': '\\droang{{{0}}}',
    '\u0338': '\\not{{{0}}}',
    '\u20d0': '\\leftharpoonaccent{{{0}}}',
    '\u20d1': '\\rightharpoonaccent{{{0}}}',
    '\u20d2': '\\vertoverlay{{{0}}}',
    '\u20d6': '\\overleftarrow{{{0}}}',
    '\u20d7': '\\vec{{{0}}}',
    '\u20db': '\\dddot{{{0}}}',
    '\u20dc': '\\ddddot{{{0}}}',
    '\u20e1': '\\overleftrightarrow{{{0}}}',
    '\u20e7': '\\annuity{{{0}}}',
    '\u20e9': '\\widebridgeabove{{{0}}}',
    '\u20f0': '\\asteraccent{{{0}}}',
    # Bottom accents
    '\u0330': '\\wideutilde{{{0}}}',
    '\u0331': '\\underbar{{{0}}}',
    '\u20e8': '\\threeunderdot{{{0}}}',
    '\u20ec': '\\underrightharpoondown{{{0}}}',
    '\u20ed': '\\underleftharpoondown{{{0}}}',
    '\u20ee': '\\underleftarrow{{{0}}}',
    '\u20ef': '\\underrightarrow{{{0}}}',
    # Over | group
    '\u23b4': '\\overbracket{{{0}}}',
    '\u23dc': '\\overparen{{{0}}}',
    '\u23de': '\\overbrace{{{0}}}',
    # Under | group
    '\u23b5': '\\underbracket{{{0}}}',
    '\u23dd': '\\underparen{{{0}}}',
    '\u23df': '\\underbrace{{{0}}}',
}
|
||||||
|
|
||||||
|
# Big (n-ary) operators: Unicode operator character -> LaTeX command.
# These values are plain commands, not format templates.
CHR_BO = {
    "\u2140": "\\Bbbsum",
    "\u220f": "\\prod",
    "\u2210": "\\coprod",
    "\u2211": "\\sum",
    "\u222b": "\\int",
    "\u22c0": "\\bigwedge",
    "\u22c1": "\\bigvee",
    "\u22c2": "\\bigcap",
    "\u22c3": "\\bigcup",
    "\u2a00": "\\bigodot",
    "\u2a01": "\\bigoplus",
    "\u2a02": "\\bigotimes",
}
|
||||||
|
|
||||||
|
# Per-character translation table for run text: Unicode math character ->
# LaTeX source.  Commands carry a trailing space so that a following letter
# is not absorbed into the command name.  Characters without an entry are
# emitted unchanged by the lookup code (``dict.get(ch, ch)``).
T = {
    # Greek letters (mathematical italic small alpha .. pi symbol)
    '\U0001d6fc': '\\alpha ',
    '\U0001d6fd': '\\beta ',
    '\U0001d6fe': '\\gamma ',
    '\U0001d6ff': '\\delta ',
    '\U0001d700': '\\epsilon ',
    '\U0001d701': '\\zeta ',
    '\U0001d702': '\\eta ',
    '\U0001d703': '\\theta ',
    '\U0001d704': '\\iota ',
    '\U0001d705': '\\kappa ',
    '\U0001d706': '\\lambda ',
    '\U0001d707': '\\mu ',
    '\U0001d708': '\\nu ',
    '\U0001d709': '\\xi ',
    '\U0001d70a': '\\omicron ',
    '\U0001d70b': '\\pi ',
    '\U0001d70c': '\\rho ',
    '\U0001d70d': '\\varsigma ',
    '\U0001d70e': '\\sigma ',
    '\U0001d70f': '\\tau ',
    '\U0001d710': '\\upsilon ',
    '\U0001d711': '\\phi ',
    '\U0001d712': '\\chi ',
    '\U0001d713': '\\psi ',
    '\U0001d714': '\\omega ',
    '\U0001d715': '\\partial ',
    '\U0001d716': '\\varepsilon ',
    '\U0001d717': '\\vartheta ',
    '\U0001d718': '\\varkappa ',
    '\U0001d719': '\\varphi ',
    '\U0001d71a': '\\varrho ',
    '\U0001d71b': '\\varpi ',
    # Relation symbols
    '\u2190': '\\leftarrow ',
    '\u2191': '\\uparrow ',
    '\u2192': '\\rightarrow ',
    '\u2193': '\\downarrow ',
    '\u2194': '\\leftrightarrow ',
    '\u2195': '\\updownarrow ',
    '\u2196': '\\nwarrow ',
    '\u2197': '\\nearrow ',
    '\u2198': '\\searrow ',
    '\u2199': '\\swarrow ',
    '\u22ee': '\\vdots ',
    '\u22ef': '\\cdots ',
    '\u22f0': '\\adots ',
    '\u22f1': '\\ddots ',
    '\u2260': '\\ne ',
    '\u2264': '\\leq ',
    '\u2265': '\\geq ',
    '\u2266': '\\leqq ',
    '\u2267': '\\geqq ',
    '\u2268': '\\lneqq ',
    '\u2269': '\\gneqq ',
    '\u226a': '\\ll ',
    '\u226b': '\\gg ',
    '\u2208': '\\in ',
    '\u2209': '\\notin ',
    '\u220b': '\\ni ',
    '\u220c': '\\nni ',
    # Ordinary symbols
    '\u221e': '\\infty ',
    # Binary relations
    '\u00b1': '\\pm ',
    '\u2213': '\\mp ',
    # Italic, Latin, uppercase
    '\U0001d434': 'A',
    '\U0001d435': 'B',
    '\U0001d436': 'C',
    '\U0001d437': 'D',
    '\U0001d438': 'E',
    '\U0001d439': 'F',
    '\U0001d43a': 'G',
    '\U0001d43b': 'H',
    '\U0001d43c': 'I',
    '\U0001d43d': 'J',
    '\U0001d43e': 'K',
    '\U0001d43f': 'L',
    '\U0001d440': 'M',
    '\U0001d441': 'N',
    '\U0001d442': 'O',
    '\U0001d443': 'P',
    '\U0001d444': 'Q',
    '\U0001d445': 'R',
    '\U0001d446': 'S',
    '\U0001d447': 'T',
    '\U0001d448': 'U',
    '\U0001d449': 'V',
    '\U0001d44a': 'W',
    '\U0001d44b': 'X',
    '\U0001d44c': 'Y',
    '\U0001d44d': 'Z',
    # Italic, Latin, lowercase
    '\U0001d44e': 'a',
    '\U0001d44f': 'b',
    '\U0001d450': 'c',
    '\U0001d451': 'd',
    '\U0001d452': 'e',
    '\U0001d453': 'f',
    '\U0001d454': 'g',
    # U+1D455 is unassigned; the italic small h lives at U+210E
    # (PLANCK CONSTANT) in Unicode.
    '\u210e': 'h',
    '\U0001d456': 'i',
    '\U0001d457': 'j',
    '\U0001d458': 'k',
    '\U0001d459': 'l',
    '\U0001d45a': 'm',
    '\U0001d45b': 'n',
    '\U0001d45c': 'o',
    '\U0001d45d': 'p',
    '\U0001d45e': 'q',
    '\U0001d45f': 'r',
    '\U0001d460': 's',
    '\U0001d461': 't',
    '\U0001d462': 'u',
    '\U0001d463': 'v',
    '\U0001d464': 'w',
    '\U0001d465': 'x',
    '\U0001d466': 'y',
    '\U0001d467': 'z',
}
|
||||||
|
|
||||||
|
# Function name (text of the fName run) -> LaTeX template.  ``{fe}`` is a
# literal placeholder (see FUNC_PLACE) later substituted with the function
# argument via ``str.replace``; it is not a ``str.format`` field.
FUNC = {
    'sin': '\\sin({fe})',
    'cos': '\\cos({fe})',
    'tan': '\\tan({fe})',
    'cot': '\\cot({fe})',
    'arcsin': '\\arcsin({fe})',
    'arccos': '\\arccos({fe})',
    'arctan': '\\arctan({fe})',
    'arccot': '\\arccot({fe})',
    'sinh': '\\sinh({fe})',
    'cosh': '\\cosh({fe})',
    'tanh': '\\tanh({fe})',
    'coth': '\\coth({fe})',
    'sec': '\\sec({fe})',
    'csc': '\\csc({fe})',
    'log': '\\log({fe})',
    'ln': '\\ln({fe})',
    'exp': '\\exp({fe})',
}

# Placeholder inside FUNC templates that marks where the argument goes.
FUNC_PLACE = '{fe}'

# LaTeX line break: two backslash characters.
BRK = '\\\\'

# Accent template used when an accent object does not specify a character.
CHR_DEFAULT = {
    'ACC_VAL': '\\hat{{{0}}}',
}

# Bar position value -> template with one positional slot.
POS = {
    'top': '\\overline{{{0}}}',  # not sure
    'bot': '\\underline{{{0}}}',
}

# Bar template used when no position is specified.
POS_DEFAULT = {
    'BAR_VAL': '\\overline{{{0}}}',
}

# Subscript / superscript templates (one positional slot each).
SUB = '_{{{0}}}'

SUP = '^{{{0}}}'

# Fraction type value -> template with ``num`` and ``den`` keyword slots.
F = {
    'bar': '\\frac{{{num}}}{{{den}}}',
    'skw': r'^{{{num}}}/_{{{den}}}',
    'noBar': '\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}',
    'lin': '{{{num}}}/{{{den}}}',
}

# Fraction template used when no type is specified.
F_DEFAULT = '\\frac{{{num}}}{{{den}}}'

# Delimiter template: ``left`` / ``right`` are the delimiter characters and
# ``text`` is the enclosed expression.
D = '\\left{left}{text}\\right{right}'

# Default delimiters; ``null`` is LaTeX's invisible delimiter ``.``.
D_DEFAULT = {
    'left': '(',
    'right': ')',
    'null': '.',
}

# Radical with an explicit degree, and the plain square-root form.
RAD = '\\sqrt[{deg}]{{{text}}}'

RAD_DEFAULT = '\\sqrt{{{text}}}'
|
||||||
|
|
||||||
|
# Equation-array template; ``text`` is the rows already joined by line breaks.
# ``\\end`` is written with an escaped backslash: a bare ``\e`` is an invalid
# escape sequence that newer Python versions warn about.
ARR = '\\begin{{array}}{{c}}{text}\\end{{array}}'

# Lower-limit function name -> template with a ``lim`` keyword slot.
LIM_FUNC = {
    'lim': '\\lim_{{{lim}}}',
    'max': '\\max_{{{lim}}}',
    'min': '\\min_{{{lim}}}',
}

# (search, replacement) pair applied to limit text: ``\rightarrow`` -> ``\to``.
LIM_TO = ('\\rightarrow', '\\to')

# Upper-limit template: ``lim`` is set over ``text``.
LIM_UPP = '\\overset{{{lim}}}{{{text}}}'

# Matrix template; ``text`` is the rows already joined by line breaks.
M = '\\begin{{matrix}}{text}\\end{{matrix}}'
|
||||||
|
|
@ -0,0 +1,362 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Office Math Markup Language (OMML)
|
||||||
|
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
||||||
|
On 25/03/2025
|
||||||
|
"""
|
||||||
|
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
from experiment.math.latex_dict import (CHARS, CHR, CHR_BO, CHR_DEFAULT, POS, POS_DEFAULT
|
||||||
|
, SUB, SUP, F, F_DEFAULT, T, FUNC, D, D_DEFAULT, RAD, RAD_DEFAULT, ARR
|
||||||
|
, LIM_FUNC, LIM_TO, LIM_UPP, M, BRK, BLANK, BACKSLASH, ALN, FUNC_PLACE)
|
||||||
|
|
||||||
|
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
||||||
|
|
||||||
|
|
||||||
|
def load(stream):
    """Lazily yield an ``oMath2Latex`` for each ``oMath`` element found.

    ``stream`` is passed straight to ``ElementTree.parse``.  The search uses
    the bare tag path, so only ``oMath`` elements that are direct children of
    the document root are matched.
    """
    document = ET.parse(stream)
    yield from (oMath2Latex(node) for node in document.findall(OMML_NS + 'oMath'))
|
||||||
|
|
||||||
|
def load_string(string):
    """Lazily yield an ``oMath2Latex`` for each ``oMath`` child of the root.

    ``string`` is an XML document parsed with ``ElementTree.fromstring``; only
    ``oMath`` elements that are direct children of its root are matched.
    """
    document_root = ET.fromstring(string)
    yield from (oMath2Latex(node) for node in document_root.findall(OMML_NS + 'oMath'))
|
||||||
|
|
||||||
|
def escape_latex(strs):
    """Backslash-escape the characters listed in ``CHARS``.

    Every pair of consecutive backslashes is first collapsed into a single
    backslash.  Then each character from ``CHARS`` gets a backslash prefix,
    unless the character immediately before it is already a backslash.
    """
    text = strs.replace(r'\\', '\\')
    pieces = []
    previous = None
    for char in text:
        needs_escape = char in CHARS and previous != BACKSLASH
        pieces.append(BACKSLASH + char if needs_escape else char)
        previous = char
    return BLANK.join(pieces)
|
||||||
|
|
||||||
|
def get_val(key, default=None, store=CHR):
    """Translate ``key`` through ``store``, with fallbacks.

    Returns ``default`` when ``key`` is None.  Otherwise returns the value
    ``store`` holds for ``key``, or ``key`` itself when ``store`` is empty or
    has no entry for it.
    """
    if key is None:
        return default
    if store:
        return store.get(key, key)
    return key
|
||||||
|
|
||||||
|
|
||||||
|
class Tag2Method(object):
    """Base class that dispatches OMML elements to handler functions.

    Subclasses are expected to provide a ``tag2meth`` mapping from tag name
    (with the OMML namespace removed) to a plain function that is invoked as
    ``func(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """Run the handler registered for ``stag`` and return its result.

        When ``stag`` is not given it is derived from ``elm.tag``.  Returns
        None when no handler is registered for the tag.
        """
        lookup = self.tag2meth.get
        if stag is None:
            stag = elm.tag.replace(OMML_NS, '')
        handler = lookup(stag)
        return handler(self, elm) if handler else None

    def process_children_list(self, elm, include=None):
        """
        Process the children of ``elm``; yield ``(stag, result, child)`` tuples.

        Children outside the OMML namespace are skipped, as are tags not
        listed in ``include`` (when ``include`` is given) and children for
        which neither the handler nor ``process_unknow`` produced a result.
        """
        for child in list(elm):
            if OMML_NS not in child.tag:
                continue
            stag = child.tag.replace(OMML_NS, '')
            if include and stag not in include:
                continue
            result = self.call_method(child, stag=stag)
            if result is None:
                # No handler, or the handler produced nothing: try the fallback.
                result = self.process_unknow(child, stag)
            if result is not None:
                yield (stag, result, child)

    def process_children_dict(self, elm, include=None):
        """
        Process the children of ``elm``; return a dict keyed by tag name.

        When a tag occurs more than once, the last occurrence wins.
        """
        return {
            stag: result
            for stag, result, _child in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        Process the children of ``elm``; return the results joined as a string.

        Results that are ``Tag2Method`` instances are converted with ``str()``.
        """
        return BLANK.join(
            str(result) if isinstance(result, Tag2Method) else result
            for _stag, result, _child in self.process_children_list(elm, include)
        )

    def process_unknow(self, elm, stag):
        """Fallback for children without a usable handler result; returns None here."""
        return None
|
||||||
|
|
||||||
|
|
||||||
|
class Pr(Tag2Method):
    """Common properties of an element (an OMML ``*Pr`` child).

    Construction walks the children of the property element once.  The
    ``val`` attribute of each ``chr``, ``pos``, ``begChr``, ``endChr`` and
    ``type`` child is recorded and can then be read as an attribute of the
    instance (e.g. ``pr.chr``); a property that was not present reads as
    None.  ``text`` holds the LaTeX contributed by the property element
    itself, which is a line break for each ``brk`` child.
    """

    text = ''

    __val_tags = ('chr', 'pos', 'begChr', 'endChr', 'type')

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Bound-method call: passing ``self`` again would raise TypeError.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, i.e. for recorded
        # property names; unknown names read as None instead of raising.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record a line break and return it so it becomes part of ``text``."""
        self.__innerdict['brk'] = BRK
        return BRK

    def do_common(self, elm):
        """Record the ``val`` attribute of a known property tag.

        Always returns None, so these tags contribute nothing to ``text``.
        """
        stag = elm.tag.replace(OMML_NS, '')
        if stag in self.__val_tags:
            t = elm.get('{0}val'.format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        'brk': do_brk,
        'chr': do_common,
        'pos': do_common,
        'begChr': do_common,
        'endChr': do_common,
        'type': do_common,
    }
|
||||||
|
|
||||||
|
|
||||||
|
class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex

    The conversion runs in ``__init__``; the result is available through the
    ``latex`` property or ``str()``.  Each supported OMML tag is handled by
    the ``do_*`` function registered for it in ``tag2meth``.
    """
    _t_dict = T

    # Tags whose LaTeX is simply the concatenation of their children.
    __direct_tags = ('box', 'sSub', 'sSup', 'sSubSup', 'num', 'den', 'deg', 'e')

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        return self.latex

    def __unicode__(self):
        # Bound-method call: passing ``self`` again would raise TypeError.
        return self.__str__()

    def process_unknow(self, elm, stag):
        """Fallback for tags without a handler result.

        Direct tags yield the LaTeX of their children, tags ending in ``Pr``
        yield a ``Pr`` property object, anything else yields None.
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        elif stag[-2:] == 'Pr':
            return Pr(elm)
        else:
            return None

    @property
    def latex(self):
        """The LaTeX string produced for the element given to ``__init__``."""
        return self._latex

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        latex_s = get_val(c_dict['accPr'].chr, default=CHR_DEFAULT.get('ACC_VAL'), store=CHR)
        return latex_s.format(c_dict['e'])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict['barPr']
        latex_s = get_val(pr.pos, default=POS_DEFAULT.get('BAR_VAL'), store=POS)
        return pr.text + latex_s.format(c_dict['e'])

    def do_d(self, elm):
        """
        the delimiter object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict['dPr']
        null = D_DEFAULT.get('null')
        s_val = get_val(pr.begChr, default=D_DEFAULT.get('left'), store=T)
        e_val = get_val(pr.endChr, default=D_DEFAULT.get('right'), store=T)
        # An explicitly empty delimiter character becomes the null delimiter.
        return pr.text + D.format(left=null if not s_val else escape_latex(s_val),
                                  text=c_dict['e'],
                                  right=null if not e_val else escape_latex(e_val))

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        pass

    def do_sub(self, elm):
        """the subscript part: children wrapped in the ``SUB`` template"""
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        """the superscript part: children wrapped in the ``SUP`` template"""
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict['fPr']
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=c_dict.get('num'), den=c_dict.get('den'))

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get('fName')
        # do_fname guarantees the placeholder is present in func_name.
        return func_name.replace(FUNC_PLACE, c_dict.get('e'))

    def do_fname(self, elm):
        """
        the func name

        Raises NotImplementedError when a plain-text function name has no
        entry in ``FUNC``.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == 'r':
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    # NotImplemented is a constant, not an exception class.
                    raise NotImplementedError("Not support func %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict['groupChrPr']
        # With no chr property get_val would return None and ``.format``
        # would fail; fall back to the bottom curly bracket (underbrace).
        latex_s = get_val(pr.chr, default=CHR['\u23df'])
        return pr.text + latex_s.format(c_dict['e'])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get('e')
        deg_text = c_dict.get('deg')
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        else:
            return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        return ARR.format(text=BRK.join(
            [t for stag, t, e in self.process_children_list(elm, include=('e',))]))

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises NotImplementedError when the base text has no entry in
        ``LIM_FUNC``.
        """
        t_dict = self.process_children_dict(elm, include=('e', 'lim'))
        latex_s = LIM_FUNC.get(t_dict['e'])
        if not latex_s:
            # NotImplemented is a constant, not an exception class.
            raise NotImplementedError("Not support lim %s" % t_dict['e'])
        else:
            return latex_s.format(lim=t_dict.get('lim'))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=('e', 'lim'))
        return LIM_UPP.format(lim=t_dict.get('lim'), text=t_dict.get('e'))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        # Only rows contribute; the matrix properties (mPr) are ignored.
        rows = [t for stag, t, e in self.process_children_list(elm) if stag == 'mr']
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=('e',))])

    def do_nary(self, elm):
        """
        the n-ary object
        """
        res = []
        bo = ''
        for stag, t, e in self.process_children_list(elm):
            if stag == 'naryPr':
                # With no chr property get_val would return None and the
                # concatenation below would fail; fall back to the integral.
                bo = get_val(t.chr, default=CHR_BO['\u222b'], store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo \\text (latex pure text support)
        """
        # findtext returns None when the run has no m:t child.
        text = elm.findtext('./{0}t'.format(OMML_NS)) or BLANK
        return escape_latex(BLANK.join(self._t_dict.get(s, s) for s in text))

    tag2meth = {
        'acc': do_acc,
        'r': do_r,
        'bar': do_bar,
        'sub': do_sub,
        'sup': do_sup,
        'f': do_f,
        'func': do_func,
        'fName': do_fname,
        'groupChr': do_groupchr,
        'd': do_d,
        'rad': do_rad,
        'eqArr': do_eqarr,
        'limLow': do_limlow,
        'limUpp': do_limupp,
        'lim': do_lim,
        'm': do_m,
        'mr': do_mr,
        'nary': do_nary,
    }
|
||||||
|
|
@ -0,0 +1,156 @@
|
||||||
|
import zipfile
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import BinaryIO
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
from .math.omml import OMML_NS, oMath2Latex
|
||||||
|
|
||||||
|
# Wrapper document used to parse an isolated math fragment: a ``w:document``
# root that declares the namespace prefixes below, with ``{0}`` as the slot
# for the fragment's XML.
MATH_ROOT_TEMPLATE = (
    "<w:document "
    'xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" '
    'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" '
    'xmlns:o="urn:schemas-microsoft-com:office:office" '
    'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" '
    'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" '
    'xmlns:v="urn:schemas-microsoft-com:vml" '
    'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" '
    'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" '
    'xmlns:w10="urn:schemas-microsoft-com:office:word" '
    'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" '
    'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" '
    'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" '
    'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" '
    'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" '
    'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">'
    "{0}</w:document>"
)
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_omath_to_latex(tag: Tag) -> str:
    """
    Convert one OMML (Office Math Markup Language) tag to LaTeX.

    The tag's XML is embedded in ``MATH_ROOT_TEMPLATE`` so that it can be
    parsed as a standalone document with its namespace prefixes declared.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the OMML element.

    Returns:
        str: The LaTeX representation of the OMML element.
    """
    wrapped_xml = MATH_ROOT_TEMPLATE.format(str(tag))
    # The fragment sits directly under the wrapper root.
    omath_element = ET.fromstring(wrapped_xml).find(OMML_NS + "oMath")
    return oMath2Latex(omath_element).latex
|
||||||
|
|
||||||
|
|
||||||
|
def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
    """
    Build the run that replaces an OMML (Office Math Markup Language) element.

    The result is a ``w:r`` tag containing a single ``w:t`` tag whose text is
    the LaTeX for ``tag`` between dollar-sign delimiters.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
        block (bool, optional): If True, the LaTeX is wrapped in ``$$`` (block
            mode) instead of ``$``. Defaults to False.

    Returns:
        Tag: A BeautifulSoup Tag object representing the replacement element.
    """
    delimiter = "$$" if block else "$"
    text_tag = Tag(name="w:t")
    text_tag.string = f"{delimiter}{_convert_omath_to_latex(tag)}{delimiter}"
    run_tag = Tag(name="w:r")
    run_tag.append(text_tag)
    return run_tag
|
||||||
|
|
||||||
|
|
||||||
|
def _replace_equations(tag: Tag):
    """
    Replace an OMML (Office Math Markup Language) element, in place, with LaTeX text.

    An "oMath" tag becomes an inline equation run.  An "oMathPara" tag becomes
    a new ``w:p`` paragraph holding one block-equation run per "oMath" found
    inside it.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the OMML element.
            Could be either "oMathPara" or "oMath".

    Raises:
        ValueError: If the tag is not supported.
    """
    if tag.name == "oMath":
        tag.replace_with(_get_omath_tag_replacement(tag, block=False))
        return
    if tag.name != "oMathPara":
        raise ValueError(f"Not supported tag: {tag.name}")

    paragraph = Tag(name="w:p")
    for omath in tag.find_all("oMath"):
        paragraph.append(_get_omath_tag_replacement(omath, block=True))
    tag.replace_with(paragraph)
|
||||||
|
|
||||||
|
|
||||||
|
def _pre_process_math(content: bytes) -> bytes:
    """
    Convert the OMML (Office Math Markup Language) elements of one DOCX XML part to LaTeX.

    The returned bytes can be written back in place of the original part.

    Args:
        content (bytes): The XML content of the DOCX part as bytes.

    Returns:
        bytes: The processed content with OMML elements replaced by their
            LaTeX equivalents, encoded as bytes.
    """
    soup = BeautifulSoup(content.decode(), features="xml")
    # Paragraph-level equations go first; the second search then only sees
    # the "oMath" elements that are still in the tree.
    for tag_name in ("oMathPara", "oMath"):
        for tag in soup.find_all(tag_name):
            _replace_equations(tag)
    return str(soup).encode()
|
||||||
|
|
||||||
|
|
||||||
|
def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
    """
    Pre-processes a DOCX file with provided steps.

    The process works by unzipping the DOCX file in memory, transforming specific XML files
    (such as converting OMML elements to LaTeX), and then zipping everything back into a
    DOCX file without writing to disk.

    Pre-processing is best effort: if a part cannot be transformed, its
    original content is written unchanged.

    Args:
        input_docx (BinaryIO): A binary input stream representing the DOCX file.

    Returns:
        BinaryIO: A binary output stream representing the processed DOCX file,
            positioned at the start.
    """
    output_docx = BytesIO()
    # The files that need to be pre-processed from .docx
    pre_process_enable_files = [
        "word/document.xml",
        "word/footnotes.xml",
        "word/endnotes.xml",
    ]
    with zipfile.ZipFile(input_docx, mode="r") as zip_input:
        files = {name: zip_input.read(name) for name in zip_input.namelist()}
        with zipfile.ZipFile(output_docx, mode="w") as zip_output:
            zip_output.comment = zip_input.comment
            for name, content in files.items():
                if name in pre_process_enable_files:
                    try:
                        # Pre-process the content
                        # In the future, if there are more pre-processing steps, they can be added here
                        content = _pre_process_math(content)
                    except Exception:
                        # Keep the original content on failure.  ``Exception``
                        # (not a bare ``except``) lets KeyboardInterrupt and
                        # SystemExit propagate.
                        pass
                # Written exactly once, outside the try, so a failing write
                # cannot produce a duplicate archive entry.
                zip_output.writestr(name, content)
    output_docx.seek(0)
    return output_docx
|
||||||
|
|
@ -3,6 +3,7 @@ import sys
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from ..converter_utils.docx.pre_process import pre_process_docx
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
@ -72,6 +73,7 @@ class DocxConverter(HtmlConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
style_map = kwargs.get("style_map", None)
|
style_map = kwargs.get("style_map", None)
|
||||||
|
pre_process_stream = pre_process_docx(file_stream)
|
||||||
return self._html_converter.convert_string(
|
return self._html_converter.convert_string(
|
||||||
mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
|
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, **kwargs
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue