diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index e68b099..bf45d6d 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -8,11 +8,13 @@ import mimetypes import os import re import shutil +import string import subprocess import sys import tempfile import traceback import zipfile +from io import BytesIO from xml.dom import minidom from typing import Any, Dict, List, Optional, Union from pathlib import Path @@ -32,6 +34,7 @@ import puremagic import requests from bs4 import BeautifulSoup from charset_normalizer import from_path +from lxml import etree as ET # Azure imports from azure.ai.documentintelligence import DocumentIntelligenceClient @@ -720,6 +723,68 @@ class DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ + def __init__(self): + self._omath_re = re.compile(r"]*>.+?", flags=re.S) + self._omath_para_re = re.compile( + r"(.+?)", flags=re.S + ) + self._formula_re = re.compile(r"\$formula\$(.+?)\$/formula\$") + + self.nsmap = { + "m": "http://schemas.openxmlformats.org/officeDocument/2006/math", + "o": "urn:schemas-microsoft-com:office:office", + "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "v": "urn:schemas-microsoft-com:vml", + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "cx": "http://schemas.microsoft.com/office/drawing/2014/chartex", + "cx1": "http://schemas.microsoft.com/office/drawing/2015/9/8/chartex", + "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", + "w10": "urn:schemas-microsoft-com:office:word", + "w14": "http://schemas.microsoft.com/office/word/2010/wordml", + "w15": "http://schemas.microsoft.com/office/word/2012/wordml", + "w16se": "http://schemas.microsoft.com/office/word/2015/wordml/symex", + "wpc": "http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas", + "wne": "http://schemas.microsoft.com/office/word/2006/wordml", + "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "wp14": "http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing", + "wpg": "http://schemas.microsoft.com/office/word/2010/wordprocessingGroup", + "wpi": "http://schemas.microsoft.com/office/word/2010/wordprocessingInk", + "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape", + } + self._xmlns_str = " ".join( + 'xmlns:{}="{}"'.format(key, value) for key, value in self.nsmap.items() + ) + self._template = string.Template( + """ + + $formula_xml + """.format( + self._xmlns_str + ) + ) + self._xsl_folder = os.path.join( + os.path.dirname( + os.path.dirname(os.path.abspath(__file__))), + 'xsl', + ) + self._mml_to_tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl") + self._omml_to_mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl") + + def _mml_to_tex(self, mml_xml: str) -> str: + tree = ET.fromstring(mml_xml) + transform = ET.XSLT(ET.parse(self._mml_to_tex_xsl)) + return str(transform(tree)) + + def _omml_to_mml(self, formula_xml: str) -> str: + xml_content = self._template.safe_substitute(formula_xml=formula_xml) + tree = ET.fromstring(xml_content) + transform = ET.XSLT(ET.parse(self._omml_to_mml_xsl)) + return str(transform(tree)) + + def _omml_to_tex(self, omml_xml: str) -> str: + mml_xml = self._omml_to_mml(omml_xml) + return self._mml_to_tex(mml_xml) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a DOCX extension = kwargs.get("file_extension", "") @@ -727,15 +792,53 @@ class DocxConverter(HtmlConverter): return None result = None - with open(local_path, "rb") as docx_file: - style_map = kwargs.get("style_map", None) + # preprocess docx equations in docx file + docx_file = self._quote_equations(local_path) + style_map = kwargs.get("style_map", None) - result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = result.value - result = self._convert(html_content) + result = mammoth.convert_to_html(docx_file, style_map=style_map) + html_content = self._unquote_omath_to_tex(result.value) + result = self._convert(html_content) return result + def _quote_omath(self, xml_content: str) -> str: + def replace(match): + quoted_omath = quote(match.group(0)) + return "$formula$ {} $/formula$".format(quoted_omath) + + xml_content = self._omath_re.sub(replace, xml_content) + xml_content = self._omath_para_re.sub(lambda m: m.group(1), xml_content) + return xml_content + + def _unquote_omath_to_tex(self, html: str) -> str: + def replace(match): + omml_content = unquote(match.group(1)) + return self._omml_to_tex(omml_content) + + return self._formula_re.sub(replace, html) + + def _quote_equations(self, docx_filename: str) -> BytesIO: + """ + Surrounds all OMML equations in the docx file with $formula$ and + $/formula$ tags. + """ + doc_files = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml") + output_zip = BytesIO() + with zipfile.ZipFile(docx_filename, "r") as z_in: + with zipfile.ZipFile(output_zip, "w", zipfile.ZIP_DEFLATED) as z_out: + z_out.comment = z_in.comment + for item in z_in.infolist(): + if item.filename not in doc_files: + z_out.writestr(item, z_in.read(item.filename)) + else: + xml_content = self._quote_omath( + z_in.read(item.filename).decode("utf8") + ).encode("utf8") + z_out.writestr(item.filename, xml_content) + output_zip.seek(0) + return output_zip + class XlsxConverter(HtmlConverter): """ diff --git a/tests/test_files/equation.docx b/tests/test_files/equation.docx new file mode 100644 index 0000000..e61721a Binary files /dev/null and b/tests/test_files/equation.docx differ diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 689d6f3..300ced3 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -272,6 +272,10 @@ def test_markitdown_local() -> None: assert "# Test" in result.text_content + + + + @pytest.mark.skipif( skip_exiftool, reason="do not run if exiftool is not installed", @@ -358,6 +362,12 @@ def test_markitdown_llm() -> None: for test_string in ["red", "circle", "blue", "square"]: assert test_string in result.text_content.lower() +def test_equation() -> None: + expected_string = r'${\left(x+a\right)}^{n}={\sum }\_{k=0}^{n}\left(\begin{array}{c}n\\ k\end{array}\right){x}^{k}{a}^{n-k}$' + markitdown = MarkItDown() + result = markitdown.convert( os.path.join(TEST_FILES_DIR, "equation.docx")) + assert expected_string == result.text_content.strip() + if __name__ == "__main__": """Runs this file's tests from the command line."""