From 799a1caf9760a9b7dd0e93b729720b15a6058b92 Mon Sep 17 00:00:00 2001 From: Sathindu Ganhala Arachchige Date: Fri, 28 Mar 2025 16:24:27 -0400 Subject: [PATCH] refactor: reformatted with black --- .../converter_utils/docx/math/latex_dict.py | 448 +++++++------ .../converter_utils/docx/math/omml.py | 624 ++++++++++-------- .../markitdown/converters/_docx_converter.py | 3 +- packages/markitdown/tests/test_module_misc.py | 3 +- 4 files changed, 559 insertions(+), 519 deletions(-) diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py index 5b94bb4..3db84c1 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py @@ -7,269 +7,267 @@ On 25/03/2025 from __future__ import unicode_literals -CHARS = ('{','}', '_', '^', '#', '&', '$', '%', '~') +CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~") -BLANK = '' -BACKSLASH = '\\' -ALN = '&' +BLANK = "" +BACKSLASH = "\\" +ALN = "&" CHR = { - #Unicode : Latex Math Symbols - #Top accents - '\u0300' : '\\grave{{{0}}}', - '\u0301' : '\\acute{{{0}}}', - '\u0302' : '\\hat{{{0}}}', - '\u0303' : '\\tilde{{{0}}}', - '\u0304' : '\\bar{{{0}}}', - '\u0305' : '\\overbar{{{0}}}', - '\u0306' : '\\breve{{{0}}}', - '\u0307' : '\\dot{{{0}}}', - '\u0308' : '\\ddot{{{0}}}', - '\u0309' : '\\ovhook{{{0}}}', - '\u030a' : '\\ocirc{{{0}}}}', - '\u030c' : '\\check{{{0}}}}', - '\u0310' : '\\candra{{{0}}}', - '\u0312' : '\\oturnedcomma{{{0}}}', - '\u0315' : '\\ocommatopright{{{0}}}', - '\u031a' : '\\droang{{{0}}}', - '\u0338' : '\\not{{{0}}}', - '\u20d0' : '\\leftharpoonaccent{{{0}}}', - '\u20d1' : '\\rightharpoonaccent{{{0}}}', - '\u20d2' : '\\vertoverlay{{{0}}}', - '\u20d6' : '\\overleftarrow{{{0}}}', - '\u20d7' : '\\vec{{{0}}}', - '\u20db' : '\\dddot{{{0}}}', - '\u20dc' : '\\ddddot{{{0}}}', - '\u20e1' : '\\overleftrightarrow{{{0}}}', - '\u20e7' : '\\annuity{{{0}}}', - '\u20e9' : '\\widebridgeabove{{{0}}}', - '\u20f0' : '\\asteraccent{{{0}}}', - #Bottom accents - '\u0330' : '\\wideutilde{{{0}}}', - '\u0331' : '\\underbar{{{0}}}', - '\u20e8' : '\\threeunderdot{{{0}}}', - '\u20ec' : '\\underrightharpoondown{{{0}}}', - '\u20ed' : '\\underleftharpoondown{{{0}}}', - '\u20ee' : '\\underledtarrow{{{0}}}', - '\u20ef' : '\\underrightarrow{{{0}}}', - #Over | group - '\u23b4' : '\\overbracket{{{0}}}', - '\u23dc' : '\\overparen{{{0}}}', - '\u23de' : '\\overbrace{{{0}}}', - #Under| group - '\u23b5' : '\\underbracket{{{0}}}', - '\u23dd' : '\\underparen{{{0}}}', - '\u23df' : '\\underbrace{{{0}}}', + # Unicode : Latex Math Symbols + # Top accents + "\u0300": "\\grave{{{0}}}", + "\u0301": "\\acute{{{0}}}", + "\u0302": "\\hat{{{0}}}", + "\u0303": "\\tilde{{{0}}}", + "\u0304": "\\bar{{{0}}}", + "\u0305": "\\overbar{{{0}}}", + "\u0306": "\\breve{{{0}}}", + "\u0307": "\\dot{{{0}}}", + "\u0308": "\\ddot{{{0}}}", + "\u0309": "\\ovhook{{{0}}}", + "\u030a": "\\ocirc{{{0}}}}", + "\u030c": "\\check{{{0}}}}", + "\u0310": "\\candra{{{0}}}", + "\u0312": "\\oturnedcomma{{{0}}}", + "\u0315": "\\ocommatopright{{{0}}}", + "\u031a": "\\droang{{{0}}}", + "\u0338": "\\not{{{0}}}", + "\u20d0": "\\leftharpoonaccent{{{0}}}", + "\u20d1": "\\rightharpoonaccent{{{0}}}", + "\u20d2": "\\vertoverlay{{{0}}}", + "\u20d6": "\\overleftarrow{{{0}}}", + "\u20d7": "\\vec{{{0}}}", + "\u20db": "\\dddot{{{0}}}", + "\u20dc": "\\ddddot{{{0}}}", + "\u20e1": "\\overleftrightarrow{{{0}}}", + "\u20e7": "\\annuity{{{0}}}", + "\u20e9": "\\widebridgeabove{{{0}}}", + "\u20f0": "\\asteraccent{{{0}}}", + # Bottom accents + "\u0330": "\\wideutilde{{{0}}}", + "\u0331": "\\underbar{{{0}}}", + "\u20e8": "\\threeunderdot{{{0}}}", + "\u20ec": "\\underrightharpoondown{{{0}}}", + "\u20ed": "\\underleftharpoondown{{{0}}}", + "\u20ee": "\\underledtarrow{{{0}}}", + "\u20ef": "\\underrightarrow{{{0}}}", + # Over | group + "\u23b4": "\\overbracket{{{0}}}", + "\u23dc": "\\overparen{{{0}}}", + "\u23de": "\\overbrace{{{0}}}", + # Under| group + "\u23b5": "\\underbracket{{{0}}}", + "\u23dd": "\\underparen{{{0}}}", + "\u23df": "\\underbrace{{{0}}}", } CHR_BO = { - #Big operators, - '\u2140' : '\\Bbbsum', - '\u220f' : '\\prod', - '\u2210' : '\\coprod', - '\u2211' : '\\sum', - '\u222b' : '\\int', - '\u22c0' : '\\bigwedge', - '\u22c1' : '\\bigvee', - '\u22c2' : '\\bigcap', - '\u22c3' : '\\bigcup', - '\u2a00' : '\\bigodot', - '\u2a01' : '\\bigoplus', - '\u2a02' : '\\bigotimes', + # Big operators, + "\u2140": "\\Bbbsum", + "\u220f": "\\prod", + "\u2210": "\\coprod", + "\u2211": "\\sum", + "\u222b": "\\int", + "\u22c0": "\\bigwedge", + "\u22c1": "\\bigvee", + "\u22c2": "\\bigcap", + "\u22c3": "\\bigcup", + "\u2a00": "\\bigodot", + "\u2a01": "\\bigoplus", + "\u2a02": "\\bigotimes", } T = { - - '\u2192' : '\\rightarrow ', - #Greek letters - '\U0001d6fc' : '\\alpha ', - '\U0001d6fd' : '\\beta ', - '\U0001d6fe' : '\\gamma ', - '\U0001d6ff' : '\\theta ', - '\U0001d700' : '\\epsilon ', - '\U0001d701' : '\\zeta ', - '\U0001d702' : '\\eta ', - '\U0001d703' : '\\theta ', - '\U0001d704' : '\\iota ', - '\U0001d705' : '\\kappa ', - '\U0001d706' : '\\lambda ', - '\U0001d707' : '\\m ', - '\U0001d708' : '\\n ', - '\U0001d709' : '\\xi ', - '\U0001d70a' : '\\omicron ', - '\U0001d70b' : '\\pi ', - '\U0001d70c' : '\\rho ', - '\U0001d70d' : '\\varsigma ', - '\U0001d70e' : '\\sigma ', - '\U0001d70f' : '\\ta ', - '\U0001d710' : '\\upsilon ', - '\U0001d711' : '\\phi ', - '\U0001d712' : '\\chi ', - '\U0001d713' : '\\psi ', - '\U0001d714' : '\\omega ', - '\U0001d715' : '\\partial ', - '\U0001d716' : '\\varepsilon ', - '\U0001d717' : '\\vartheta ', - '\U0001d718' : '\\varkappa ', - '\U0001d719' : '\\varphi ', - '\U0001d71a' : '\\varrho ', - '\U0001d71b' : '\\varpi ', - #Relation symbols - '\u2190' : '\\leftarrow ', - '\u2191' : '\\uparrow ', - '\u2192' : '\\rightarrow ', - '\u2193' : '\\downright ', - '\u2194' : '\\leftrightarrow ', - '\u2195' : '\\updownarrow ', - '\u2196' : '\\nwarrow ', - '\u2197' : '\\nearrow ', - '\u2198' : '\\searrow ', - '\u2199' : '\\swarrow ', - '\u22ee' : '\\vdots ', - '\u22ef' : '\\cdots ', - '\u22f0' : '\\adots ', - '\u22f1' : '\\ddots ', - '\u2260' : '\\ne ', - '\u2264' : '\\leq ', - '\u2265' : '\\geq ', - '\u2266' : '\\leqq ', - '\u2267' : '\\geqq ', - '\u2268' : '\\lneqq ', - '\u2269' : '\\gneqq ', - '\u226a' : '\\ll ', - '\u226b' : '\\gg ', - '\u2208' : '\\in ', - '\u2209' : '\\notin ', - '\u220b' : '\\ni ', - '\u220c' : '\\nni ', - - #Ordinary symbols - '\u221e' : '\\infty ', - #Binary relations - '\u00b1' : '\\pm ', - '\u2213' : '\\mp ', - #Italic, Latin, uppercase - '\U0001d434' : 'A', - '\U0001d435' : 'B', - '\U0001d436' : 'C', - '\U0001d437' : 'D', - '\U0001d438' : 'E', - '\U0001d439' : 'F', - '\U0001d43a' : 'G', - '\U0001d43b' : 'H', - '\U0001d43c' : 'I', - '\U0001d43d' : 'J', - '\U0001d43e' : 'K', - '\U0001d43f' : 'L', - '\U0001d440' : 'M', - '\U0001d441' : 'N', - '\U0001d442' : 'O', - '\U0001d443' : 'P', - '\U0001d444' : 'Q', - '\U0001d445' : 'R', - '\U0001d446' : 'S', - '\U0001d447' : 'T', - '\U0001d448' : 'U', - '\U0001d449' : 'V', - '\U0001d44a' : 'W', - '\U0001d44b' : 'X', - '\U0001d44c' : 'Y', - '\U0001d44d' : 'Z', - #Italic, Latin, lowercase - '\U0001d44e' : 'a', - '\U0001d44f' : 'b', - '\U0001d450' : 'c', - '\U0001d451' : 'd', - '\U0001d452' : 'e', - '\U0001d453' : 'f', - '\U0001d454' : 'g', - '\U0001d456' : 'i', - '\U0001d457' : 'j', - '\U0001d458' : 'k', - '\U0001d459' : 'l', - '\U0001d45a' : 'm', - '\U0001d45b' : 'n', - '\U0001d45c' : 'o', - '\U0001d45d' : 'p', - '\U0001d45e' : 'q', - '\U0001d45f' : 'r', - '\U0001d460' : 's', - '\U0001d461' : 't', - '\U0001d462' : 'u', - '\U0001d463' : 'v', - '\U0001d464' : 'w', - '\U0001d465' : 'x', - '\U0001d466' : 'y', - '\U0001d467' : 'z', + "\u2192": "\\rightarrow ", + # Greek letters + "\U0001d6fc": "\\alpha ", + "\U0001d6fd": "\\beta ", + "\U0001d6fe": "\\gamma ", + "\U0001d6ff": "\\theta ", + "\U0001d700": "\\epsilon ", + "\U0001d701": "\\zeta ", + "\U0001d702": "\\eta ", + "\U0001d703": "\\theta ", + "\U0001d704": "\\iota ", + "\U0001d705": "\\kappa ", + "\U0001d706": "\\lambda ", + "\U0001d707": "\\m ", + "\U0001d708": "\\n ", + "\U0001d709": "\\xi ", + "\U0001d70a": "\\omicron ", + "\U0001d70b": "\\pi ", + "\U0001d70c": "\\rho ", + "\U0001d70d": "\\varsigma ", + "\U0001d70e": "\\sigma ", + "\U0001d70f": "\\ta ", + "\U0001d710": "\\upsilon ", + "\U0001d711": "\\phi ", + "\U0001d712": "\\chi ", + "\U0001d713": "\\psi ", + "\U0001d714": "\\omega ", + "\U0001d715": "\\partial ", + "\U0001d716": "\\varepsilon ", + "\U0001d717": "\\vartheta ", + "\U0001d718": "\\varkappa ", + "\U0001d719": "\\varphi ", + "\U0001d71a": "\\varrho ", + "\U0001d71b": "\\varpi ", + # Relation symbols + "\u2190": "\\leftarrow ", + "\u2191": "\\uparrow ", + "\u2192": "\\rightarrow ", + "\u2193": "\\downright ", + "\u2194": "\\leftrightarrow ", + "\u2195": "\\updownarrow ", + "\u2196": "\\nwarrow ", + "\u2197": "\\nearrow ", + "\u2198": "\\searrow ", + "\u2199": "\\swarrow ", + "\u22ee": "\\vdots ", + "\u22ef": "\\cdots ", + "\u22f0": "\\adots ", + "\u22f1": "\\ddots ", + "\u2260": "\\ne ", + "\u2264": "\\leq ", + "\u2265": "\\geq ", + "\u2266": "\\leqq ", + "\u2267": "\\geqq ", + "\u2268": "\\lneqq ", + "\u2269": "\\gneqq ", + "\u226a": "\\ll ", + "\u226b": "\\gg ", + "\u2208": "\\in ", + "\u2209": "\\notin ", + "\u220b": "\\ni ", + "\u220c": "\\nni ", + # Ordinary symbols + "\u221e": "\\infty ", + # Binary relations + "\u00b1": "\\pm ", + "\u2213": "\\mp ", + # Italic, Latin, uppercase + "\U0001d434": "A", + "\U0001d435": "B", + "\U0001d436": "C", + "\U0001d437": "D", + "\U0001d438": "E", + "\U0001d439": "F", + "\U0001d43a": "G", + "\U0001d43b": "H", + "\U0001d43c": "I", + "\U0001d43d": "J", + "\U0001d43e": "K", + "\U0001d43f": "L", + "\U0001d440": "M", + "\U0001d441": "N", + "\U0001d442": "O", + "\U0001d443": "P", + "\U0001d444": "Q", + "\U0001d445": "R", + "\U0001d446": "S", + "\U0001d447": "T", + "\U0001d448": "U", + "\U0001d449": "V", + "\U0001d44a": "W", + "\U0001d44b": "X", + "\U0001d44c": "Y", + "\U0001d44d": "Z", + # Italic, Latin, lowercase + "\U0001d44e": "a", + "\U0001d44f": "b", + "\U0001d450": "c", + "\U0001d451": "d", + "\U0001d452": "e", + "\U0001d453": "f", + "\U0001d454": "g", + "\U0001d456": "i", + "\U0001d457": "j", + "\U0001d458": "k", + "\U0001d459": "l", + "\U0001d45a": "m", + "\U0001d45b": "n", + "\U0001d45c": "o", + "\U0001d45d": "p", + "\U0001d45e": "q", + "\U0001d45f": "r", + "\U0001d460": "s", + "\U0001d461": "t", + "\U0001d462": "u", + "\U0001d463": "v", + "\U0001d464": "w", + "\U0001d465": "x", + "\U0001d466": "y", + "\U0001d467": "z", } -FUNC ={ - 'sin' : '\\sin({fe})', - 'cos' : '\\cos({fe})', - 'tan' : '\\tan({fe})', - 'arcsin' : '\\arcsin({fe})', - 'arccos' : '\\arccos({fe})', - 'arctan' : '\\arctan({fe})', - 'arccot' : '\\arccot({fe})', - 'sinh' : '\\sinh({fe})', - 'cosh' : '\\cosh({fe})', - 'tanh' : '\\tanh({fe})', - 'coth' : '\\coth({fe})', - 'sec' : '\\sec({fe})', - 'csc' : '\\csc({fe})', +FUNC = { + "sin": "\\sin({fe})", + "cos": "\\cos({fe})", + "tan": "\\tan({fe})", + "arcsin": "\\arcsin({fe})", + "arccos": "\\arccos({fe})", + "arctan": "\\arctan({fe})", + "arccot": "\\arccot({fe})", + "sinh": "\\sinh({fe})", + "cosh": "\\cosh({fe})", + "tanh": "\\tanh({fe})", + "coth": "\\coth({fe})", + "sec": "\\sec({fe})", + "csc": "\\csc({fe})", } -FUNC_PLACE = '{fe}' +FUNC_PLACE = "{fe}" -BRK = '\\\\' +BRK = "\\\\" CHR_DEFAULT = { - 'ACC_VAL':'\\hat{{{0}}}', + "ACC_VAL": "\\hat{{{0}}}", } POS = { - 'top' : '\\overline{{{0}}}', # not sure - 'bot' : '\\underline{{{0}}}', + "top": "\\overline{{{0}}}", # not sure + "bot": "\\underline{{{0}}}", } POS_DEFAULT = { - 'BAR_VAL': '\\overline{{{0}}}', + "BAR_VAL": "\\overline{{{0}}}", } -SUB = '_{{{0}}}' +SUB = "_{{{0}}}" -SUP = '^{{{0}}}' +SUP = "^{{{0}}}" F = { - 'bar': '\\frac{{{num}}}{{{den}}}', - 'skw': r'^{{{num}}}/_{{{den}}}', - 'noBar': '\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}', - 'lin' : '{{{num}}}/{{{den}}}', + "bar": "\\frac{{{num}}}{{{den}}}", + "skw": r"^{{{num}}}/_{{{den}}}", + "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}", + "lin": "{{{num}}}/{{{den}}}", } -F_DEFAULT = '\\frac{{{num}}}{{{den}}}' +F_DEFAULT = "\\frac{{{num}}}{{{den}}}" -D = '\\left{left}{text}\\right{right}' +D = "\\left{left}{text}\\right{right}" D_DEFAULT = { - 'left':'(', - 'right':')', - 'null':'.', + "left": "(", + "right": ")", + "null": ".", } -RAD = '\\sqrt[{deg}]{{{text}}}' +RAD = "\\sqrt[{deg}]{{{text}}}" -RAD_DEFAULT = '\\sqrt{{{text}}}' +RAD_DEFAULT = "\\sqrt{{{text}}}" -ARR = '\\begin{{array}}{{c}}{text}\end{{array}}' +ARR = "\\begin{{array}}{{c}}{text}\end{{array}}" LIM_FUNC = { - 'lim':'\\lim_{{{lim}}}', - 'max':'\\max_{{{lim}}}', - 'min':'\\min_{{{lim}}}', + "lim": "\\lim_{{{lim}}}", + "max": "\\max_{{{lim}}}", + "min": "\\min_{{{lim}}}", } -LIM_TO = ('\\rightarrow','\\to') +LIM_TO = ("\\rightarrow", "\\to") -LIM_UPP = '\\overset{{{lim}}}{{{text}}}' +LIM_UPP = "\\overset{{{lim}}}{{{text}}}" -M = '\\begin{{matrix}}{text}\end{{matrix}}' +M = "\\begin{{matrix}}{text}\end{{matrix}}" diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py index c258d3b..6e6ccbd 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py @@ -8,355 +8,395 @@ On 25/03/2025 import xml.etree.ElementTree as ET -from .latex_dict import (CHARS, CHR, CHR_BO, CHR_DEFAULT, POS, POS_DEFAULT - , SUB, SUP, F, F_DEFAULT, T, FUNC, D, D_DEFAULT, RAD, RAD_DEFAULT, ARR - , LIM_FUNC, LIM_TO, LIM_UPP, M, BRK, BLANK, BACKSLASH, ALN, FUNC_PLACE) +from .latex_dict import ( + CHARS, + CHR, + CHR_BO, + CHR_DEFAULT, + POS, + POS_DEFAULT, + SUB, + SUP, + F, + F_DEFAULT, + T, + FUNC, + D, + D_DEFAULT, + RAD, + RAD_DEFAULT, + ARR, + LIM_FUNC, + LIM_TO, + LIM_UPP, + M, + BRK, + BLANK, + BACKSLASH, + ALN, + FUNC_PLACE, +) OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}" def load(stream): - tree = ET.parse(stream) - for omath in tree.findall(OMML_NS+'oMath'): - yield oMath2Latex(omath) + tree = ET.parse(stream) + for omath in tree.findall(OMML_NS + "oMath"): + yield oMath2Latex(omath) + def load_string(string): - root = ET.fromstring(string) - for omath in root.findall(OMML_NS+'oMath'): - yield oMath2Latex(omath) + root = ET.fromstring(string) + for omath in root.findall(OMML_NS + "oMath"): + yield oMath2Latex(omath) + def escape_latex(strs): - last = None - new_chr = [] - strs = strs.replace(r'\\','\\') - for c in strs : - if (c in CHARS) and (last !=BACKSLASH): - new_chr.append(BACKSLASH+c) - else: - new_chr.append(c) - last = c - return BLANK.join(new_chr) + last = None + new_chr = [] + strs = strs.replace(r"\\", "\\") + for c in strs: + if (c in CHARS) and (last != BACKSLASH): + new_chr.append(BACKSLASH + c) + else: + new_chr.append(c) + last = c + return BLANK.join(new_chr) -def get_val(key,default=None,store=CHR): - if key is not None: - return key if not store else store.get(key,key) - else: - return default + +def get_val(key, default=None, store=CHR): + if key is not None: + return key if not store else store.get(key, key) + else: + return default class Tag2Method(object): - def call_method(self,elm,stag=None): - getmethod = self.tag2meth.get - if stag is None: - stag = elm.tag.replace(OMML_NS,'') - method = getmethod(stag) - if method: - return method(self,elm) - else: - return None + def call_method(self, elm, stag=None): + getmethod = self.tag2meth.get + if stag is None: + stag = elm.tag.replace(OMML_NS, "") + method = getmethod(stag) + if method: + return method(self, elm) + else: + return None - def process_children_list(self,elm,include=None): - """ - process children of the elm,return iterable - """ - for _e in list(elm): - if (OMML_NS not in _e.tag): - continue - stag = _e.tag.replace(OMML_NS,'') - if include and (stag not in include): - continue - t = self.call_method(_e,stag=stag) - if t is None: - t = self.process_unknow(_e,stag) - if t is None: - continue - yield (stag,t,_e) + def process_children_list(self, elm, include=None): + """ + process children of the elm,return iterable + """ + for _e in list(elm): + if OMML_NS not in _e.tag: + continue + stag = _e.tag.replace(OMML_NS, "") + if include and (stag not in include): + continue + t = self.call_method(_e, stag=stag) + if t is None: + t = self.process_unknow(_e, stag) + if t is None: + continue + yield (stag, t, _e) - def process_children_dict(self,elm,include=None): - """ - process children of the elm,return dict - """ - latex_chars = dict() - for stag,t,e in self.process_children_list(elm,include): - latex_chars[stag] = t - return latex_chars + def process_children_dict(self, elm, include=None): + """ + process children of the elm,return dict + """ + latex_chars = dict() + for stag, t, e in self.process_children_list(elm, include): + latex_chars[stag] = t + return latex_chars - def process_children(self,elm,include=None): - """ - process children of the elm,return string - """ - return BLANK.join(( t if not isinstance(t,Tag2Method) else str(t) - for stag,t,e in self.process_children_list(elm,include))) + def process_children(self, elm, include=None): + """ + process children of the elm,return string + """ + return BLANK.join( + ( + t if not isinstance(t, Tag2Method) else str(t) + for stag, t, e in self.process_children_list(elm, include) + ) + ) - def process_unknow(self,elm,stag): - return None + def process_unknow(self, elm, stag): + return None class Pr(Tag2Method): - text = '' + text = "" - __val_tags = ('chr','pos','begChr','endChr','type') + __val_tags = ("chr", "pos", "begChr", "endChr", "type") - __innerdict= None #can't use the __dict__ + __innerdict = None # can't use the __dict__ - """ common properties of element""" - def __init__(self, elm): - self.__innerdict={} - self.text=self.process_children(elm) + """ common properties of element""" - def __str__(self): - return self.text + def __init__(self, elm): + self.__innerdict = {} + self.text = self.process_children(elm) - def __unicode__(self): - return self.__str__(self) + def __str__(self): + return self.text - def __getattr__(self,name): - return self.__innerdict.get(name,None) + def __unicode__(self): + return self.__str__(self) - def do_brk(self,elm): - self.__innerdict['brk'] = BRK - return BRK + def __getattr__(self, name): + return self.__innerdict.get(name, None) - def do_common(self,elm): - stag = elm.tag.replace(OMML_NS,'') - if stag in self.__val_tags: - t = elm.get('{0}val'.format(OMML_NS)) - self.__innerdict[stag] = t - return None + def do_brk(self, elm): + self.__innerdict["brk"] = BRK + return BRK - tag2meth = { - 'brk':do_brk, - 'chr':do_common, - 'pos':do_common, - 'begChr':do_common, - 'endChr':do_common, - 'type':do_common, - } + def do_common(self, elm): + stag = elm.tag.replace(OMML_NS, "") + if stag in self.__val_tags: + t = elm.get("{0}val".format(OMML_NS)) + self.__innerdict[stag] = t + return None + + tag2meth = { + "brk": do_brk, + "chr": do_common, + "pos": do_common, + "begChr": do_common, + "endChr": do_common, + "type": do_common, + } class oMath2Latex(Tag2Method): - """ - Convert oMath element of omml to latex - """ - _t_dict = T + """ + Convert oMath element of omml to latex + """ - __direct_tags = ('box','sSub','sSup','sSubSup','num','den','deg','e') + _t_dict = T - def __init__(self, element): - self._latex = self.process_children(element) + __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e") - def __str__(self): - return self.latex + def __init__(self, element): + self._latex = self.process_children(element) - def __unicode__(self): - return self.__str__(self) + def __str__(self): + return self.latex - def process_unknow(self,elm,stag): - if stag in self.__direct_tags: - return self.process_children(elm) - elif stag[-2:] == 'Pr': - return Pr(elm) - else: - return None + def __unicode__(self): + return self.__str__(self) - @property - def latex(self): - return self._latex + def process_unknow(self, elm, stag): + if stag in self.__direct_tags: + return self.process_children(elm) + elif stag[-2:] == "Pr": + return Pr(elm) + else: + return None - def do_acc(self,elm): - """ - the accent function - """ - c_dict = self.process_children_dict(elm) - latex_s = get_val(c_dict['accPr'].chr,default=CHR_DEFAULT.get('ACC_VAL'),store=CHR) - return latex_s.format(c_dict['e']) + @property + def latex(self): + return self._latex - def do_bar(self,elm): - """ - the bar function - """ - c_dict = self.process_children_dict(elm) - pr = c_dict['barPr'] - latex_s = get_val(pr.pos,default=POS_DEFAULT.get('BAR_VAL'),store=POS) - return pr.text+latex_s.format(c_dict['e']) + def do_acc(self, elm): + """ + the accent function + """ + c_dict = self.process_children_dict(elm) + latex_s = get_val( + c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR + ) + return latex_s.format(c_dict["e"]) - def do_d(self,elm): - """ - the delimiter object - """ - c_dict = self.process_children_dict(elm) - pr = c_dict['dPr'] - null = D_DEFAULT.get('null') - s_val = get_val(pr.begChr,default=D_DEFAULT.get('left'),store=T) - e_val = get_val(pr.endChr,default=D_DEFAULT.get('right'),store=T) - return pr.text+D.format(left= null if not s_val else escape_latex(s_val), - text=c_dict['e'], - right= null if not e_val else escape_latex(e_val)) + def do_bar(self, elm): + """ + the bar function + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["barPr"] + latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS) + return pr.text + latex_s.format(c_dict["e"]) + def do_d(self, elm): + """ + the delimiter object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["dPr"] + null = D_DEFAULT.get("null") + s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T) + e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T) + return pr.text + D.format( + left=null if not s_val else escape_latex(s_val), + text=c_dict["e"], + right=null if not e_val else escape_latex(e_val), + ) - def do_spre(self,elm): - """ - the Pre-Sub-Superscript object -- Not support yet - """ - pass + def do_spre(self, elm): + """ + the Pre-Sub-Superscript object -- Not support yet + """ + pass - def do_sub(self,elm): - text = self.process_children(elm) - return SUB.format(text) + def do_sub(self, elm): + text = self.process_children(elm) + return SUB.format(text) - def do_sup(self,elm): - text = self.process_children(elm) - return SUP.format(text) + def do_sup(self, elm): + text = self.process_children(elm) + return SUP.format(text) - def do_f(self,elm): - """ - the fraction object - """ - c_dict = self.process_children_dict(elm) - pr = c_dict['fPr'] - latex_s = get_val(pr.type,default=F_DEFAULT,store=F) - return pr.text+latex_s.format(num=c_dict.get('num'),den=c_dict.get('den')) + def do_f(self, elm): + """ + the fraction object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["fPr"] + latex_s = get_val(pr.type, default=F_DEFAULT, store=F) + return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den")) - def do_func(self,elm): - """ - the Function-Apply object (Examples:sin cos) - """ - c_dict = self.process_children_dict(elm) - func_name = c_dict.get('fName') - return func_name.replace(FUNC_PLACE,c_dict.get('e')) + def do_func(self, elm): + """ + the Function-Apply object (Examples:sin cos) + """ + c_dict = self.process_children_dict(elm) + func_name = c_dict.get("fName") + return func_name.replace(FUNC_PLACE, c_dict.get("e")) - def do_fname(self,elm): - """ - the func name - """ - latex_chars = [] - for stag,t,e in self.process_children_list(elm): - if stag == 'r': - if FUNC.get(t): - latex_chars.append(FUNC[t]) - else : - raise NotImplemented("Not support func %s" % t) - else: - latex_chars.append(t) - t = BLANK.join(latex_chars) - return t if FUNC_PLACE in t else t+FUNC_PLACE #do_func will replace this + def do_fname(self, elm): + """ + the func name + """ + latex_chars = [] + for stag, t, e in self.process_children_list(elm): + if stag == "r": + if FUNC.get(t): + latex_chars.append(FUNC[t]) + else: + raise NotImplemented("Not support func %s" % t) + else: + latex_chars.append(t) + t = BLANK.join(latex_chars) + return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this - def do_groupchr(self,elm): - """ - the Group-Character object - """ - c_dict = self.process_children_dict(elm) - pr = c_dict['groupChrPr'] - latex_s = get_val(pr.chr) - return pr.text+latex_s.format(c_dict['e']) + def do_groupchr(self, elm): + """ + the Group-Character object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["groupChrPr"] + latex_s = get_val(pr.chr) + return pr.text + latex_s.format(c_dict["e"]) - def do_rad(self,elm): - """ - the radical object - """ - c_dict = self.process_children_dict(elm) - text = c_dict.get('e') - deg_text = c_dict.get('deg') - if deg_text: - return RAD.format(deg=deg_text,text=text) - else: - return RAD_DEFAULT.format(text=text) - - def do_eqarr(self,elm): - """ - the Array object - """ - return ARR.format(text=BRK.join( - [t for stag,t,e in self.process_children_list(elm,include=('e',))])) + def do_rad(self, elm): + """ + the radical object + """ + c_dict = self.process_children_dict(elm) + text = c_dict.get("e") + deg_text = c_dict.get("deg") + if deg_text: + return RAD.format(deg=deg_text, text=text) + else: + return RAD_DEFAULT.format(text=text) + def do_eqarr(self, elm): + """ + the Array object + """ + return ARR.format( + text=BRK.join( + [t for stag, t, e in self.process_children_list(elm, include=("e",))] + ) + ) - def do_limlow(self,elm): - """ - the Lower-Limit object - """ - t_dict = self.process_children_dict(elm,include=('e','lim')) - latex_s = LIM_FUNC.get(t_dict['e']) - if not latex_s : - raise NotImplemented("Not support lim %s" % t_dict['e']) - else: - return latex_s.format(lim=t_dict.get('lim')) + def do_limlow(self, elm): + """ + the Lower-Limit object + """ + t_dict = self.process_children_dict(elm, include=("e", "lim")) + latex_s = LIM_FUNC.get(t_dict["e"]) + if not latex_s: + raise NotImplemented("Not support lim %s" % t_dict["e"]) + else: + return latex_s.format(lim=t_dict.get("lim")) - def do_limupp(self,elm): - """ - the Upper-Limit object - """ - t_dict = self.process_children_dict(elm,include=('e','lim')) - return LIM_UPP.format(lim=t_dict.get('lim'),text=t_dict.get('e')) + def do_limupp(self, elm): + """ + the Upper-Limit object + """ + t_dict = self.process_children_dict(elm, include=("e", "lim")) + return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e")) - def do_lim(self,elm): - """ - the lower limit of the limLow object and the upper limit of the limUpp function - """ - return self.process_children(elm).replace(LIM_TO[0],LIM_TO[1]) - - def do_m(self,elm): - """ - the Matrix object - """ - rows = [] - for stag,t,e in self.process_children_list(elm): - if stag is 'mPr': - pass - elif stag == 'mr': - rows.append(t) - return M.format(text=BRK.join(rows)) + def do_lim(self, elm): + """ + the lower limit of the limLow object and the upper limit of the limUpp function + """ + return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1]) - def do_mr(self,elm): - """ - a single row of the matrix m - """ - return ALN.join( - [t for stag,t,e in self.process_children_list(elm,include=('e',))]) + def do_m(self, elm): + """ + the Matrix object + """ + rows = [] + for stag, t, e in self.process_children_list(elm): + if stag is "mPr": + pass + elif stag == "mr": + rows.append(t) + return M.format(text=BRK.join(rows)) - def do_nary(self,elm): - """ - the n-ary object - """ - res = [] - bo = '' - for stag,t,e in self.process_children_list(elm): - if stag == 'naryPr': - bo = get_val(t.chr,store=CHR_BO) - else : - res.append(t) - return bo+BLANK.join(res) + def do_mr(self, elm): + """ + a single row of the matrix m + """ + return ALN.join( + [t for stag, t, e in self.process_children_list(elm, include=("e",))] + ) - def do_r(self,elm): - """ - Get text from 'r' element,And try convert them to latex symbols - @todo text style support , (sty) - @todo \text (latex pure text support) - """ - _str = [] - for s in elm.findtext('./{0}t'.format(OMML_NS)): - #s = s if isinstance(s,unicode) else unicode(s,'utf-8') - _str.append(self._t_dict.get(s,s)) - return escape_latex(BLANK.join(_str)) + def do_nary(self, elm): + """ + the n-ary object + """ + res = [] + bo = "" + for stag, t, e in self.process_children_list(elm): + if stag == "naryPr": + bo = get_val(t.chr, store=CHR_BO) + else: + res.append(t) + return bo + BLANK.join(res) - tag2meth={ - 'acc' : do_acc, - 'r' : do_r, - 'bar' : do_bar, - 'sub' : do_sub, - 'sup' : do_sup, - 'f' : do_f, - 'func': do_func, - 'fName' : do_fname, - 'groupChr' : do_groupchr, - 'd' : do_d, - 'rad' : do_rad, - 'eqArr' : do_eqarr, - 'limLow' : do_limlow, - 'limUpp' : do_limupp, - 'lim' : do_lim, - 'm' : do_m, - 'mr' : do_mr, - 'nary' : do_nary, - } + def do_r(self, elm): + """ + Get text from 'r' element,And try convert them to latex symbols + @todo text style support , (sty) + @todo \text (latex pure text support) + """ + _str = [] + for s in elm.findtext("./{0}t".format(OMML_NS)): + # s = s if isinstance(s,unicode) else unicode(s,'utf-8') + _str.append(self._t_dict.get(s, s)) + return escape_latex(BLANK.join(_str)) + + tag2meth = { + "acc": do_acc, + "r": do_r, + "bar": do_bar, + "sub": do_sub, + "sup": do_sup, + "f": do_f, + "func": do_func, + "fName": do_fname, + "groupChr": do_groupchr, + "d": do_d, + "rad": do_rad, + "eqArr": do_eqarr, + "limLow": do_limlow, + "limUpp": do_limupp, + "lim": do_lim, + "m": do_m, + "mr": do_mr, + "nary": do_nary, + } diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 6b30094..b320695 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -75,5 +75,6 @@ class DocxConverter(HtmlConverter): style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( - mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, **kwargs + mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, + **kwargs, ) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 5e9ced5..1819183 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -272,9 +272,10 @@ def test_docx_equations() -> None: assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found" # Find block equations wrapped with double $$ and check if they are present - block_equations = re.findall(r'\$\$(.+?)\$\$', result.text_content) + block_equations = re.findall(r"\$\$(.+?)\$\$", result.text_content) assert block_equations, "No block equations found in the document." + def test_input_as_strings() -> None: markitdown = MarkItDown()