refactor: reformatted with black

This commit is contained in:
Sathindu Ganhala Arachchige 2025-03-28 16:24:27 -04:00
parent 6a66b275bb
commit 799a1caf97
4 changed files with 559 additions and 519 deletions

View file

@ -7,269 +7,267 @@ On 25/03/2025
from __future__ import unicode_literals from __future__ import unicode_literals
# Characters that escape_latex prefixes with a backslash.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
BLANK = ""
BACKSLASH = "\\"
# Separator placed between the cells of a matrix row.
ALN = "&"

# Accent / group characters -> LaTeX templates consumed with ``str.format``.
# ``{0}`` receives the operand; literal braces are doubled, so every template
# must end in exactly ``{{{0}}}`` (an extra ``}`` makes ``format`` raise
# ``ValueError: Single '}' encountered in format string``).
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    "\u030a": "\\ocirc{{{0}}}",  # fixed: stray trailing brace
    "\u030c": "\\check{{{0}}}",  # fixed: stray trailing brace
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",  # fixed typo: was \underledtarrow
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under| group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}

# N-ary ("big") operator characters -> LaTeX commands (no operand slot).
CHR_BO = {
    # Big operators,
    "\u2140": "\\Bbbsum",
    "\u220f": "\\prod",
    "\u2210": "\\coprod",
    "\u2211": "\\sum",
    "\u222b": "\\int",
    "\u22c0": "\\bigwedge",
    "\u22c1": "\\bigvee",
    "\u22c2": "\\bigcap",
    "\u22c3": "\\bigcup",
    "\u2a00": "\\bigodot",
    "\u2a01": "\\bigoplus",
    "\u2a02": "\\bigotimes",
}
# Single-character translation table: Unicode math character -> LaTeX text.
# Commands carry a trailing space so that the next character is not absorbed
# into the macro name when the pieces are concatenated.
T = {
    # Greek letters (Mathematical Italic Small, U+1D6FC..U+1D71B)
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
    "\U0001d6fe": "\\gamma ",
    "\U0001d6ff": "\\delta ",  # fixed: was \theta
    "\U0001d700": "\\epsilon ",
    "\U0001d701": "\\zeta ",
    "\U0001d702": "\\eta ",
    "\U0001d703": "\\theta ",
    "\U0001d704": "\\iota ",
    "\U0001d705": "\\kappa ",
    "\U0001d706": "\\lambda ",
    "\U0001d707": "\\mu ",  # fixed: was \m
    "\U0001d708": "\\nu ",  # fixed: was \n
    "\U0001d709": "\\xi ",
    "\U0001d70a": "\\omicron ",
    "\U0001d70b": "\\pi ",
    "\U0001d70c": "\\rho ",
    "\U0001d70d": "\\varsigma ",
    "\U0001d70e": "\\sigma ",
    "\U0001d70f": "\\tau ",  # fixed: was \ta
    "\U0001d710": "\\upsilon ",
    "\U0001d711": "\\phi ",
    "\U0001d712": "\\chi ",
    "\U0001d713": "\\psi ",
    "\U0001d714": "\\omega ",
    "\U0001d715": "\\partial ",
    "\U0001d716": "\\varepsilon ",
    "\U0001d717": "\\vartheta ",
    "\U0001d718": "\\varkappa ",
    "\U0001d719": "\\varphi ",
    "\U0001d71a": "\\varrho ",
    "\U0001d71b": "\\varpi ",
    # Relation symbols
    "\u2190": "\\leftarrow ",
    "\u2191": "\\uparrow ",
    "\u2192": "\\rightarrow ",  # previously listed twice with the same value
    "\u2193": "\\downarrow ",  # fixed: was \downright
    "\u2194": "\\leftrightarrow ",
    "\u2195": "\\updownarrow ",
    "\u2196": "\\nwarrow ",
    "\u2197": "\\nearrow ",
    "\u2198": "\\searrow ",
    "\u2199": "\\swarrow ",
    "\u22ee": "\\vdots ",
    "\u22ef": "\\cdots ",
    "\u22f0": "\\adots ",
    "\u22f1": "\\ddots ",
    "\u2260": "\\ne ",
    "\u2264": "\\leq ",
    "\u2265": "\\geq ",
    "\u2266": "\\leqq ",
    "\u2267": "\\geqq ",
    "\u2268": "\\lneqq ",
    "\u2269": "\\gneqq ",
    "\u226a": "\\ll ",
    "\u226b": "\\gg ",
    "\u2208": "\\in ",
    "\u2209": "\\notin ",
    "\u220b": "\\ni ",
    "\u220c": "\\nni ",
    # Ordinary symbols
    "\u221e": "\\infty ",
    # Binary relations
    "\u00b1": "\\pm ",
    "\u2213": "\\mp ",
    # Italic, Latin, uppercase
    "\U0001d434": "A",
    "\U0001d435": "B",
    "\U0001d436": "C",
    "\U0001d437": "D",
    "\U0001d438": "E",
    "\U0001d439": "F",
    "\U0001d43a": "G",
    "\U0001d43b": "H",
    "\U0001d43c": "I",
    "\U0001d43d": "J",
    "\U0001d43e": "K",
    "\U0001d43f": "L",
    "\U0001d440": "M",
    "\U0001d441": "N",
    "\U0001d442": "O",
    "\U0001d443": "P",
    "\U0001d444": "Q",
    "\U0001d445": "R",
    "\U0001d446": "S",
    "\U0001d447": "T",
    "\U0001d448": "U",
    "\U0001d449": "V",
    "\U0001d44a": "W",
    "\U0001d44b": "X",
    "\U0001d44c": "Y",
    "\U0001d44d": "Z",
    # Italic, Latin, lowercase
    "\U0001d44e": "a",
    "\U0001d44f": "b",
    "\U0001d450": "c",
    "\U0001d451": "d",
    "\U0001d452": "e",
    "\U0001d453": "f",
    "\U0001d454": "g",
    # U+1D455 is a reserved code point; Unicode encodes italic small h as
    # U+210E (PLANCK CONSTANT), so map that one instead.
    "\u210e": "h",
    "\U0001d456": "i",
    "\U0001d457": "j",
    "\U0001d458": "k",
    "\U0001d459": "l",
    "\U0001d45a": "m",
    "\U0001d45b": "n",
    "\U0001d45c": "o",
    "\U0001d45d": "p",
    "\U0001d45e": "q",
    "\U0001d45f": "r",
    "\U0001d460": "s",
    "\U0001d461": "t",
    "\U0001d462": "u",
    "\U0001d463": "v",
    "\U0001d464": "w",
    "\U0001d465": "x",
    "\U0001d466": "y",
    "\U0001d467": "z",
}
# Function name -> LaTeX template; ``{fe}`` (FUNC_PLACE) marks where the
# function argument is substituted with ``str.replace``.
FUNC = {
    "sin": "\\sin({fe})",
    "cos": "\\cos({fe})",
    "tan": "\\tan({fe})",
    "arcsin": "\\arcsin({fe})",
    "arccos": "\\arccos({fe})",
    "arctan": "\\arctan({fe})",
    "arccot": "\\arccot({fe})",
    "sinh": "\\sinh({fe})",
    "cosh": "\\cosh({fe})",
    "tanh": "\\tanh({fe})",
    "coth": "\\coth({fe})",
    "sec": "\\sec({fe})",
    "csc": "\\csc({fe})",
}

FUNC_PLACE = "{fe}"

# LaTeX line break (two backslashes).
BRK = "\\\\"

CHR_DEFAULT = {
    "ACC_VAL": "\\hat{{{0}}}",
}

POS = {
    "top": "\\overline{{{0}}}",  # not sure
    "bot": "\\underline{{{0}}}",
}

POS_DEFAULT = {
    "BAR_VAL": "\\overline{{{0}}}",
}

SUB = "_{{{0}}}"

SUP = "^{{{0}}}"

# Fraction type -> template with ``num`` / ``den`` format fields.
F = {
    "bar": "\\frac{{{num}}}{{{den}}}",
    "skw": r"^{{{num}}}/_{{{den}}}",
    "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}

F_DEFAULT = "\\frac{{{num}}}{{{den}}}"

D = "\\left{left}{text}\\right{right}"

D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

RAD = "\\sqrt[{deg}]{{{text}}}"

RAD_DEFAULT = "\\sqrt{{{text}}}"

# ``\\end`` is spelled with an escaped backslash: a bare ``\e`` is an invalid
# escape sequence (a warning today, slated to become an error). The resulting
# string value is unchanged.
ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}"

LIM_FUNC = {
    "lim": "\\lim_{{{lim}}}",
    "max": "\\max_{{{lim}}}",
    "min": "\\min_{{{lim}}}",
}

# (text to look for, replacement) used when rendering a limit expression.
LIM_TO = ("\\rightarrow", "\\to")

LIM_UPP = "\\overset{{{lim}}}{{{text}}}"

M = "\\begin{{matrix}}{text}\\end{{matrix}}"

View file

@ -8,355 +8,395 @@ On 25/03/2025
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from .latex_dict import (CHARS, CHR, CHR_BO, CHR_DEFAULT, POS, POS_DEFAULT from .latex_dict import (
, SUB, SUP, F, F_DEFAULT, T, FUNC, D, D_DEFAULT, RAD, RAD_DEFAULT, ARR CHARS,
, LIM_FUNC, LIM_TO, LIM_UPP, M, BRK, BLANK, BACKSLASH, ALN, FUNC_PLACE) CHR,
CHR_BO,
CHR_DEFAULT,
POS,
POS_DEFAULT,
SUB,
SUP,
F,
F_DEFAULT,
T,
FUNC,
D,
D_DEFAULT,
RAD,
RAD_DEFAULT,
ARR,
LIM_FUNC,
LIM_TO,
LIM_UPP,
M,
BRK,
BLANK,
BACKSLASH,
ALN,
FUNC_PLACE,
)
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}" OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
def load(stream):
    """Parse OMML from *stream* and lazily yield one converter per equation.

    *stream* is handed to ``ET.parse``; every ``oMath`` element returned by
    ``findall`` on the parsed tree is wrapped in ``oMath2Latex``.
    """
    omath_tag = OMML_NS + "oMath"
    document = ET.parse(stream)
    for node in document.findall(omath_tag):
        yield oMath2Latex(node)
def load_string(string):
    """Parse OMML from an XML *string* and lazily yield one converter per equation.

    Every ``oMath`` element returned by ``findall`` on the parsed root is
    wrapped in ``oMath2Latex``.
    """
    omath_tag = OMML_NS + "oMath"
    document_root = ET.fromstring(string)
    for node in document_root.findall(omath_tag):
        yield oMath2Latex(node)
def escape_latex(strs):
    """Return *strs* with the characters listed in ``CHARS`` backslash-escaped.

    Doubled backslashes are first collapsed to a single one. A character is
    left untouched when the character right before it is already a backslash.
    """
    text = strs.replace(r"\\", "\\")
    pieces = []
    previous = None
    for ch in text:
        needs_escape = ch in CHARS and previous != BACKSLASH
        pieces.append(BACKSLASH + ch if needs_escape else ch)
        previous = ch
    return BLANK.join(pieces)
def get_val(key, default=None, store=CHR):
    """Translate *key* through *store*, falling back sensibly.

    Returns *default* when *key* is ``None``; returns *key* itself when
    *store* is falsy or does not contain it; otherwise the mapped value.
    """
    if key is None:
        return default
    if not store:
        return key
    return store.get(key, key)
class Tag2Method(object):
    """Dispatch OMML elements to handler functions keyed by tag name.

    Subclasses supply ``tag2meth``: a mapping from a tag name (namespace
    prefix removed) to a plain function called as ``func(self, elm)``.
    """

    # Empty default so the base class works on its own (every lookup simply
    # misses); subclasses override this with their real dispatch table.
    tag2meth = {}

    def call_method(self, elm, stag=None):
        """Run the handler registered for *elm* and return its result.

        *stag* is the short tag name; when omitted it is derived from
        ``elm.tag`` by stripping the OMML namespace. Returns ``None`` when no
        handler is registered.
        """
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        method = self.tag2meth.get(stag)
        if method:
            return method(self, elm)
        return None

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(stag, result, child)`` for each child in the OMML
        namespace. Children whose short tag is not in *include* (when given)
        are skipped, as are children for which both the handler and
        ``process_unknow`` return ``None``.
        """
        for _e in elm:
            if OMML_NS not in _e.tag:
                continue
            stag = _e.tag.replace(OMML_NS, "")
            if include and (stag not in include):
                continue
            t = self.call_method(_e, stag=stag)
            if t is None:
                t = self.process_unknow(_e, stag)
                if t is None:
                    continue
            yield (stag, t, _e)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Keys are short tag names; when a tag occurs more than once the last
        occurrence wins.
        """
        latex_chars = dict()
        for stag, t, e in self.process_children_list(elm, include):
            latex_chars[stag] = t
        return latex_chars

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string

        Results that are ``Tag2Method`` instances are converted with ``str``
        before everything is joined together.
        """
        return BLANK.join(
            (
                t if not isinstance(t, Tag2Method) else str(t)
                for stag, t, e in self.process_children_list(elm, include)
            )
        )

    def process_unknow(self, elm, stag):
        """Fallback for tags without a handler; the base version ignores them."""
        return None
class Pr(Tag2Method):
    """Common properties of an element (the ``*Pr`` child).

    The ``val`` attribute of each ``chr``/``pos``/``begChr``/``endChr``/``type``
    child is recorded and exposed as an attribute of the same name; reading a
    property that was not present yields ``None``. ``text`` holds the output
    produced by the children (a line break for ``brk``).
    """

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Fixed: ``self.__str__(self)`` passed ``self`` twice -> TypeError.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails: serve recorded
        # properties, and None for anything that was never set.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record a line break and emit it."""
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        """Store the element's ``val`` attribute under its short tag name."""
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get("{0}val".format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }
class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex

    The conversion runs in the constructor; the result is available through
    ``latex`` or ``str()``.
    """

    _t_dict = T

    # Container tags whose output is simply the concatenation of their children.
    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        return self.latex

    def __unicode__(self):
        # Fixed: ``self.__str__(self)`` passed ``self`` twice -> TypeError.
        return self.__str__()

    def process_unknow(self, elm, stag):
        """Handle tags without a dedicated method.

        Pass-through containers are rendered from their children, ``*Pr``
        elements become ``Pr`` objects, anything else is dropped.
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        elif stag[-2:] == "Pr":
            return Pr(elm)
        else:
            return None

    @property
    def latex(self):
        return self._latex

    # NOTE: the ``*Pr`` child is looked up with ``dict.get`` and read through
    # ``getattr(pr, name, default)`` in the handlers below, so an element that
    # omits its properties falls back to the defaults instead of raising
    # ``KeyError``.

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("accPr")
        latex_s = get_val(
            getattr(pr, "chr", None), default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
        )
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("barPr")
        latex_s = get_val(
            getattr(pr, "pos", None), default=POS_DEFAULT.get("BAR_VAL"), store=POS
        )
        return getattr(pr, "text", BLANK) + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """
        the delimiter object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("dPr")
        null = D_DEFAULT.get("null")
        s_val = get_val(
            getattr(pr, "begChr", None), default=D_DEFAULT.get("left"), store=T
        )
        e_val = get_val(
            getattr(pr, "endChr", None), default=D_DEFAULT.get("right"), store=T
        )
        # An explicitly empty delimiter is rendered as the LaTeX null
        # delimiter so \left / \right stay paired.
        return getattr(pr, "text", BLANK) + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict["e"],
            right=null if not e_val else escape_latex(e_val),
        )

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        pass

    def do_sub(self, elm):
        """the subscript part: children wrapped in ``SUB``"""
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        """the superscript part: children wrapped in ``SUP``"""
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("fPr")
        latex_s = get_val(getattr(pr, "type", None), default=F_DEFAULT, store=F)
        return getattr(pr, "text", BLANK) + latex_s.format(
            num=c_dict.get("num"), den=c_dict.get("den")
        )

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        return func_name.replace(FUNC_PLACE, c_dict.get("e"))

    def do_fname(self, elm):
        """
        the func name

        Raises NotImplementedError for a function name missing from ``FUNC``.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    # Fixed: ``NotImplemented`` is a constant, not an
                    # exception class; calling it raised TypeError.
                    raise NotImplementedError("Not support func %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("groupChrPr")
        # NOTE(review): the fallback assumes the OOXML default group character
        # is U+23DF (underbrace) -- confirm against ECMA-376. Without a
        # fallback a missing <m:chr> crashed on ``None.format``.
        latex_s = get_val(getattr(pr, "chr", None), default=CHR.get("\u23df"))
        return getattr(pr, "text", BLANK) + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        else:
            return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        return ARR.format(
            text=BRK.join(
                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
            )
        )

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises NotImplementedError when the base is not a key of ``LIM_FUNC``.
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            # Fixed: was ``raise NotImplemented(...)`` (TypeError at runtime).
            raise NotImplementedError("Not support lim %s" % t_dict["e"])
        else:
            return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        # Only rows contribute output; ``mPr`` and anything else is ignored.
        # (The old ``stag is "mPr"`` identity test on a literal was unreliable
        # and redundant.)
        rows = [t for stag, t, e in self.process_children_list(elm) if stag == "mr"]
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
        )

    def do_nary(self, elm):
        """
        the n-ary object
        """
        res = []
        bo = ""
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                # NOTE(review): an n-ary with no <m:chr> is treated as an
                # integral here (believed to be the OOXML default -- confirm
                # against ECMA-376). Previously ``bo`` became None and the
                # concatenation below raised TypeError.
                bo = get_val(t.chr, default=CHR_BO.get("\u222b"), store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo LaTeX pure-text support (the ``text`` command)
        """
        # findtext returns None when the run has no <m:t> child; treat that
        # as empty text instead of failing on iteration.
        text = elm.findtext("./{0}t".format(OMML_NS)) or BLANK
        # Translate character by character, then escape LaTeX specials.
        return escape_latex(BLANK.join(self._t_dict.get(s, s) for s in text))

    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }

View file

@ -75,5 +75,6 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None) style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream) pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string( return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, **kwargs mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
**kwargs,
) )

View file

@ -272,9 +272,10 @@ def test_docx_equations() -> None:
assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found" assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found"
# Find block equations wrapped with double $$ and check if they are present # Find block equations wrapped with double $$ and check if they are present
block_equations = re.findall(r'\$\$(.+?)\$\$', result.text_content) block_equations = re.findall(r"\$\$(.+?)\$\$", result.text_content)
assert block_equations, "No block equations found in the document." assert block_equations, "No block equations found in the document."
def test_input_as_strings() -> None: def test_input_as_strings() -> None:
markitdown = MarkItDown() markitdown = MarkItDown()