Merge 238ff216dc into 73ba69d8cd
This commit is contained in:
commit
90367ddec6
3 changed files with 118 additions and 5 deletions
|
|
@ -8,11 +8,13 @@ import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
import string
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import traceback
|
import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
|
from io import BytesIO
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -32,6 +34,7 @@ import puremagic
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from charset_normalizer import from_path
|
from charset_normalizer import from_path
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
# Azure imports
|
# Azure imports
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||||
|
|
@ -720,6 +723,68 @@ class DocxConverter(HtmlConverter):
|
||||||
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self._omath_re = re.compile(r"<m:oMath[^<>]*>.+?</m:oMath>", flags=re.S)
|
||||||
|
self._omath_para_re = re.compile(
|
||||||
|
r"<m:oMathPara\s*>(.+?)</m:oMathPara>", flags=re.S
|
||||||
|
)
|
||||||
|
self._formula_re = re.compile(r"\$formula\$(.+?)\$/formula\$")
|
||||||
|
|
||||||
|
self.nsmap = {
|
||||||
|
"m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
|
||||||
|
"o": "urn:schemas-microsoft-com:office:office",
|
||||||
|
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
||||||
|
"v": "urn:schemas-microsoft-com:vml",
|
||||||
|
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
||||||
|
"cx": "http://schemas.microsoft.com/office/drawing/2014/chartex",
|
||||||
|
"cx1": "http://schemas.microsoft.com/office/drawing/2015/9/8/chartex",
|
||||||
|
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
|
||||||
|
"w10": "urn:schemas-microsoft-com:office:word",
|
||||||
|
"w14": "http://schemas.microsoft.com/office/word/2010/wordml",
|
||||||
|
"w15": "http://schemas.microsoft.com/office/word/2012/wordml",
|
||||||
|
"w16se": "http://schemas.microsoft.com/office/word/2015/wordml/symex",
|
||||||
|
"wpc": "http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas",
|
||||||
|
"wne": "http://schemas.microsoft.com/office/word/2006/wordml",
|
||||||
|
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
||||||
|
"wp14": "http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing",
|
||||||
|
"wpg": "http://schemas.microsoft.com/office/word/2010/wordprocessingGroup",
|
||||||
|
"wpi": "http://schemas.microsoft.com/office/word/2010/wordprocessingInk",
|
||||||
|
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
|
||||||
|
}
|
||||||
|
self._xmlns_str = " ".join(
|
||||||
|
'xmlns:{}="{}"'.format(key, value) for key, value in self.nsmap.items()
|
||||||
|
)
|
||||||
|
self._template = string.Template(
|
||||||
|
"""<?xml version="1.0" standalone="yes"?>
|
||||||
|
<w:document mc:Ignorable="w14 w15 w16se wp14" {}>
|
||||||
|
$formula_xml
|
||||||
|
</w:document>""".format(
|
||||||
|
self._xmlns_str
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self._xsl_folder = os.path.join(
|
||||||
|
os.path.dirname(
|
||||||
|
os.path.dirname(os.path.abspath(__file__))),
|
||||||
|
'xsl',
|
||||||
|
)
|
||||||
|
self._mml_to_tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl")
|
||||||
|
self._omml_to_mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl")
|
||||||
|
|
||||||
|
def _mml_to_tex(self, mml_xml: str) -> str:
|
||||||
|
tree = ET.fromstring(mml_xml)
|
||||||
|
transform = ET.XSLT(ET.parse(self._mml_to_tex_xsl))
|
||||||
|
return str(transform(tree))
|
||||||
|
|
||||||
|
def _omml_to_mml(self, formula_xml: str) -> str:
|
||||||
|
xml_content = self._template.safe_substitute(formula_xml=formula_xml)
|
||||||
|
tree = ET.fromstring(xml_content)
|
||||||
|
transform = ET.XSLT(ET.parse(self._omml_to_mml_xsl))
|
||||||
|
return str(transform(tree))
|
||||||
|
|
||||||
|
def _omml_to_tex(self, omml_xml: str) -> str:
|
||||||
|
mml_xml = self._omml_to_mml(omml_xml)
|
||||||
|
return self._mml_to_tex(mml_xml)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a DOCX
|
# Bail if not a DOCX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|
@ -727,15 +792,53 @@ class DocxConverter(HtmlConverter):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rb") as docx_file:
|
# preprocess docx equations in docx file
|
||||||
style_map = kwargs.get("style_map", None)
|
docx_file = self._quote_equations(local_path)
|
||||||
|
style_map = kwargs.get("style_map", None)
|
||||||
|
|
||||||
result = mammoth.convert_to_html(docx_file, style_map=style_map)
|
result = mammoth.convert_to_html(docx_file, style_map=style_map)
|
||||||
html_content = result.value
|
html_content = self._unquote_omath_to_tex(result.value)
|
||||||
result = self._convert(html_content)
|
result = self._convert(html_content)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def _quote_omath(self, xml_content: str) -> str:
|
||||||
|
def replace(match):
|
||||||
|
quoted_omath = quote(match.group(0))
|
||||||
|
return "<w:t>$formula$ {} $/formula$</w:t>".format(quoted_omath)
|
||||||
|
|
||||||
|
xml_content = self._omath_re.sub(replace, xml_content)
|
||||||
|
xml_content = self._omath_para_re.sub(lambda m: m.group(1), xml_content)
|
||||||
|
return xml_content
|
||||||
|
|
||||||
|
def _unquote_omath_to_tex(self, html: str) -> str:
|
||||||
|
def replace(match):
|
||||||
|
omml_content = unquote(match.group(1))
|
||||||
|
return self._omml_to_tex(omml_content)
|
||||||
|
|
||||||
|
return self._formula_re.sub(replace, html)
|
||||||
|
|
||||||
|
def _quote_equations(self, docx_filename: str) -> BytesIO:
|
||||||
|
"""
|
||||||
|
Surrounds all OMML equations in the docx file with $formula$ and
|
||||||
|
$/formula$ tags.
|
||||||
|
"""
|
||||||
|
doc_files = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml")
|
||||||
|
output_zip = BytesIO()
|
||||||
|
with zipfile.ZipFile(docx_filename, "r") as z_in:
|
||||||
|
with zipfile.ZipFile(output_zip, "w", zipfile.ZIP_DEFLATED) as z_out:
|
||||||
|
z_out.comment = z_in.comment
|
||||||
|
for item in z_in.infolist():
|
||||||
|
if item.filename not in doc_files:
|
||||||
|
z_out.writestr(item, z_in.read(item.filename))
|
||||||
|
else:
|
||||||
|
xml_content = self._quote_omath(
|
||||||
|
z_in.read(item.filename).decode("utf8")
|
||||||
|
).encode("utf8")
|
||||||
|
z_out.writestr(item.filename, xml_content)
|
||||||
|
output_zip.seek(0)
|
||||||
|
return output_zip
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(HtmlConverter):
|
class XlsxConverter(HtmlConverter):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
BIN
tests/test_files/equation.docx
vendored
Normal file
BIN
tests/test_files/equation.docx
vendored
Normal file
Binary file not shown.
|
|
@ -272,6 +272,10 @@ def test_markitdown_local() -> None:
|
||||||
assert "# Test" in result.text_content
|
assert "# Test" in result.text_content
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_exiftool,
|
skip_exiftool,
|
||||||
reason="do not run if exiftool is not installed",
|
reason="do not run if exiftool is not installed",
|
||||||
|
|
@ -358,6 +362,12 @@ def test_markitdown_llm() -> None:
|
||||||
for test_string in ["red", "circle", "blue", "square"]:
|
for test_string in ["red", "circle", "blue", "square"]:
|
||||||
assert test_string in result.text_content.lower()
|
assert test_string in result.text_content.lower()
|
||||||
|
|
||||||
|
def test_equation() -> None:
|
||||||
|
expected_string = r'${\left(x+a\right)}^{n}={\sum }\_{k=0}^{n}\left(\begin{array}{c}n\\ k\end{array}\right){x}^{k}{a}^{n-k}$'
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
result = markitdown.convert( os.path.join(TEST_FILES_DIR, "equation.docx"))
|
||||||
|
assert expected_string == result.text_content.strip()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue