diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 2554c70..9185b4e 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -704,6 +704,13 @@ class DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ + def __init__(self): + self._omath_re = re.compile(r"]*>.+?", flags=re.S) + self._omath_para_re = re.compile( + r"(.+?)", flags=re.S + ) + self._formula_re = re.compile(r"\$formula\$(.+?)\$/formula\$") + self.nsmap = { "m": "http://schemas.openxmlformats.org/officeDocument/2006/math", "o": "urn:schemas-microsoft-com:office:office", @@ -725,15 +732,34 @@ class DocxConverter(HtmlConverter): "wpi": "http://schemas.microsoft.com/office/word/2010/wordprocessingInk", "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape", } + self._xmlns_str = " ".join( + 'xmlns:{}="{}"'.format(key, value) for key, value in self.nsmap.items() + ) + self._template = string.Template( + """ + + $formula_xml + """.format( + self._xmlns_str + ) + ) + self._xsl_folder = os.path.join( + os.path.dirname( + os.path.dirname(os.path.abspath(__file__))), + 'xsl', + ) + self._mml2tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl") + self._omml2mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl") + def _mml2tex(self, mml_xml: str) -> str: tree = ET.fromstring(mml_xml) - transform = ET.XSLT(ET.parse(self._mml2tex_xsl_filename)) + transform = ET.XSLT(ET.parse(self._mml2tex_xsl)) return str(transform(tree)) - def _omml2mml(self, omml_xml: str) -> str: - xml_content = self._template.safe_substitute(omml_xml=omml_xml) + def _omml2mml(self, formula_xml: str) -> str: + xml_content = self._template.safe_substitute(formula_xml=formula_xml) tree = ET.fromstring(xml_content) - transform = ET.XSLT(ET.parse(self._omml2mml_xsl_filename)) + transform = ET.XSLT(ET.parse(self._omml2mml_xsl)) return str(transform(tree)) def _omml2tex(self, omml_xml: str) -> str: @@ -760,10 +786,10 @@ class DocxConverter(HtmlConverter): def _encapsulate_omath(self, xml_content: str) -> str: def replace(match): quoted_omath = quote(match.group(0)) - return "$omml$ {} $/omml$".format(quoted_omath) + return "$formula$ {} $/formula$".format(quoted_omath) - xml_content = self._omath_pattern.sub(replace, xml_content) - xml_content = self._omath_para_pattern.sub(lambda m: m.group(1), xml_content) + xml_content = self._omath_re.sub(replace, xml_content) + xml_content = self._omath_para_re.sub(lambda m: m.group(1), xml_content) return xml_content def _convert_omath_to_tex(self, html: str) -> str: @@ -771,16 +797,17 @@ class DocxConverter(HtmlConverter): omml_content = unquote(match.group(1)) return self._omml2tex(omml_content) - return self._omml_pattern.sub(replace, html) + return self._formula_re.sub(replace, html) def _encapsulate_equations(self, docx_filename: str) -> BytesIO: """ + Surrounds all OMML equations in the docx file with $formula$ and $/formula$ tags. Args: docx_filename: The path to the docx file to process. Returns: - docx file with OMML equations encapsulated in $omml$ and $/omml$ tags. + docx file with OMML equations encapsulated in $formula$ and $/formula$ tags. """ doc_files = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml") output_zip = BytesIO()