Merge branch 'main' into fix-docker

2025-02-03 00:22:17 -08:00 · 2025-02-03 00:22:17 -08:00 · 657d63e5d1
commit 657d63e5d1
parent 310ba02dcb bf6a15e9b5
8 changed files with 367 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,3 @@
-> [!IMPORTANT]
-> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year!
-
 # MarkItDown

 [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
@ -36,12 +33,20 @@ Or use `-o` to specify the output file:
 markitdown path-to-file.pdf -o document.md
 ```

+To use Document Intelligence conversion:
+
+```bash
+markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
+```
+
 You can also pipe content:

 ```bash
 cat path-to-file.pdf | markitdown
 ```

+More information about how to set up an Azure Document Intelligence resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
+
 ### Python API

 Basic usage in Python:
@ -54,6 +59,16 @@ result = md.convert("test.xlsx")
 print(result.text_content)
 ```

+Document Intelligence conversion in Python:
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
+result = md.convert("test.pdf")
+print(result.text_content)
+```
+
 To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:

 ```python
--- a/pyproject.toml
+++ b/pyproject.toml
@ -32,14 +32,18 @@ dependencies = [
  "python-pptx",
  "pandas",
  "openpyxl",
+  "xlrd",
  "pdfminer.six",
  "puremagic",
  "pydub",
+  "olefile",
  "youtube-transcript-api",
  "SpeechRecognition",
  "pathvalidate",
  "charset-normalizer",
  "openai",
+  "azure-ai-documentintelligence",
+  "azure-identity"
 ]

 [project.urls]
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@ -4,8 +4,8 @@
 import argparse
 import sys
 from textwrap import dedent
-from .__about__ import __version__
-from ._markitdown import MarkItDown, DocumentConverterResult
+from __about__ import __version__
+from _markitdown import MarkItDown, DocumentConverterResult


 def main():
@ -57,15 +57,36 @@ def main():
        "--output",
        help="Output file name. If not provided, output is written to stdout.",
    )
+    parser.add_argument(
+        "-d",
+        "--use-docintel",
+        action="store_true",
+        help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
+    )
+    parser.add_argument(
+        "-e",
+        "--endpoint",
+        type=str,
+        help="Document Intelligence Endpoint. Required if using Document Intelligence.",
+    )
    args = parser.parse_args()

-    if args.filename is None:
-        markitdown = MarkItDown()
-        result = markitdown.convert_stream(sys.stdin.buffer)
-        _handle_output(args, result)
+    if args.use_docintel:
+        if args.endpoint is None:
+            raise ValueError(
+                "Document Intelligence Endpoint is required when using Document Intelligence."
+            )
+        elif args.filename is None:
+            raise ValueError("Filename is required when using Document Intelligence.")
+        markitdown = MarkItDown(docintel_endpoint=args.endpoint)
    else:
        markitdown = MarkItDown()
+
+    if args.filename is None:
+        result = markitdown.convert_stream(sys.stdin.buffer)
+    else:
        result = markitdown.convert(args.filename)
+
    _handle_output(args, result)


--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -21,6 +21,7 @@ from warnings import warn, resetwarnings, catch_warnings

 import mammoth
 import markdownify
+import olefile
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
@ -32,7 +33,21 @@ import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path

+# Azure imports
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import (
+    AnalyzeDocumentRequest,
+    AnalyzeResult,
+    DocumentAnalysisFeature,
+)
+from azure.identity import DefaultAzureCredential
+
+# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
+# This constant is a temporary fix until the bug is resolved.
+CONTENT_FORMAT = "markdown"
+
 # Optional Transcription support
+IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
    # Using warnings' catch_warnings to catch
    # pydub's warning of ffmpeg or avconv missing
@ -171,7 +186,10 @@ class PlainTextConverter(DocumentConverter):
        # Only accept text files
        if content_type is None:
            return None
-        elif "text/" not in content_type.lower():
+        elif all(
+            not content_type.lower().startswith(type_prefix)
+            for type_prefix in ["text/", "application/json"]
+        ):
            return None

        text_content = str(from_path(local_path).best())
@ -724,7 +742,31 @@ class XlsxConverter(HtmlConverter):
        if extension.lower() != ".xlsx":
            return None

-        sheets = pd.read_excel(local_path, sheet_name=None)
+        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        md_content = ""
+        for s in sheets:
+            md_content += f"## {s}\n"
+            html_content = sheets[s].to_html(index=False)
+            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+
+class XlsConverter(HtmlConverter):
+    """
+    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a XLS
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".xls":
+            return None
+
+        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
@ -863,14 +905,25 @@ class MediaConverter(DocumentConverter):
    Abstract class for multi-modal media (e.g., images and audio)
    """

-    def _get_metadata(self, local_path):
-        exiftool = shutil.which("exiftool")
-        if not exiftool:
+    def _get_metadata(self, local_path, exiftool_path=None):
+        if not exiftool_path:
+            which_exiftool = shutil.which("exiftool")
+            if which_exiftool:
+                warn(
+                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., 
+
+    md = MarkItDown(exiftool_path="{which_exiftool}")
+
+This warning will be removed in future releases.
+""",
+                    DeprecationWarning,
+                )
+
            return None
        else:
            try:
                result = subprocess.run(
-                    [exiftool, "-json", local_path], capture_output=True, text=True
+                    [exiftool_path, "-json", local_path], capture_output=True, text=True
                ).stdout
                return json.loads(result)[0]
            except Exception:
@ -891,7 +944,7 @@ class WavConverter(MediaConverter):
        md_content = ""

        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "Title",
@ -946,7 +999,7 @@ class Mp3Converter(WavConverter):
        md_content = ""

        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "Title",
@ -1007,7 +1060,7 @@ class ImageConverter(MediaConverter):
        md_content = ""

        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "ImageSize",
@ -1076,6 +1129,79 @@ class ImageConverter(MediaConverter):
        return response.choices[0].message.content


+class OutlookMsgConverter(DocumentConverter):
+    """Converts Outlook .msg files to markdown by extracting email metadata and content.

+    Uses the olefile package to parse the .msg file structure and extract:
+    - Email headers (From, To, Subject)
+    - Email body content
+    """

+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Convert the .msg file at `local_path` into a Markdown document.

+        Args:
+            local_path: Path of the file to convert.
+            **kwargs: Conversion options; only "file_extension" is read here.

+        Returns:
+            None when the extension is not ".msg"; otherwise a
+            DocumentConverterResult whose title is the Subject header (None
+            if that stream is missing) and whose text is the Markdown.

+        Raises:
+            FileConversionException: If the file cannot be opened or parsed.
+        """
+        # Bail if not a MSG file
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".msg":
+            return None

+        try:
+            msg = olefile.OleFileIO(local_path)
+            try:
+                # Get headers
+                headers = {
+                    "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
+                    "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
+                    "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
+                }

+                # Get email body
+                body = self._get_stream_data(msg, "__substg1.0_1000001F")
+            finally:
+                # Release the file handle even when reading a stream fails;
+                # previously close() was only reached on the success path.
+                msg.close()

+            # Extract email metadata
+            md_content = "# Email Message\n\n"

+            # Add headers to markdown, skipping any that were missing or empty
+            for key, value in headers.items():
+                if value:
+                    md_content += f"**{key}:** {value}\n"

+            md_content += "\n## Content\n\n"
+            if body:
+                md_content += body

+            return DocumentConverterResult(
+                title=headers.get("Subject"), text_content=md_content.strip()
+            )

+        except Exception as e:
+            # Chain the original error so the root cause stays in the traceback.
+            raise FileConversionException(
+                f"Could not convert MSG file '{local_path}': {str(e)}"
+            ) from e

+    def _get_stream_data(
+        self, msg: olefile.OleFileIO, stream_path: str
+    ) -> Union[str, None]:
+        """Helper to safely extract and decode stream data from the MSG file.

+        Returns the stripped, decoded text of `stream_path`, or None when the
+        stream does not exist or reading it fails (deliberately best-effort).
+        """
+        try:
+            if msg.exists(stream_path):
+                data = msg.openstream(stream_path).read()
+                # Try UTF-16 first (common for .msg files), then UTF-8
+                for encoding in ("utf-16-le", "utf-8"):
+                    try:
+                        return data.decode(encoding).strip()
+                    except UnicodeDecodeError:
+                        continue
+                # Last resort - ignore errors
+                return data.decode("utf-8", errors="ignore").strip()
+        except Exception:
+            # Best-effort: a missing or unreadable stream is reported as None.
+            pass
+        return None
+
+
 class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.

@ -1205,6 +1331,74 @@ class ZipConverter(DocumentConverter):
            )


+class DocumentIntelligenceConverter(DocumentConverter):
+    """DocumentConverter that delegates text extraction to Document Intelligence."""

+    def __init__(
+        self,
+        endpoint: str,
+        api_version: str = "2024-07-31-preview",
+    ):
+        """Build the service client for `endpoint` using DefaultAzureCredential."""
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.doc_intel_client = DocumentIntelligenceClient(
+            endpoint=endpoint,
+            api_version=api_version,
+            credential=DefaultAzureCredential(),
+        )

+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Send the file to the "prebuilt-layout" model and return its markdown.

+        Returns None when the "file_extension" kwarg is not one of the
+        extensions listed below; otherwise a DocumentConverterResult with no
+        title and the service's content with HTML comments removed.
+        """
+        ext = kwargs.get("file_extension", "").lower()

+        # Bail if extension is not supported by Document Intelligence
+        supported = (
+            ".pdf",
+            ".docx",
+            ".xlsx",
+            ".pptx",
+            ".html",
+            ".jpeg",
+            ".jpg",
+            ".png",
+            ".bmp",
+            ".tiff",
+            ".heif",
+        )
+        if ext not in supported:
+            return None

+        # Certain document analysis features are not available for some
+        # filetypes (.xlsx, .pptx, .html), so request none for those.
+        if ext in (".xlsx", ".pptx", ".html"):
+            analysis_features = []
+        else:
+            analysis_features = [
+                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
+                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
+                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
+            ]

+        # Get the bytestring for the local path
+        with open(local_path, "rb") as handle:
+            payload = handle.read()

+        # Extract the text using Azure Document Intelligence
+        request = AnalyzeDocumentRequest(bytes_source=payload)
+        poller = self.doc_intel_client.begin_analyze_document(
+            model_id="prebuilt-layout",
+            body=request,
+            features=analysis_features,
+            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
+        )
+        result: AnalyzeResult = poller.result()

+        # Remove the HTML comments from the markdown generated by Doc Intelligence
+        cleaned = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+        return DocumentConverterResult(title=None, text_content=cleaned)
+
+
 class FileConversionException(BaseException):
    pass

@ -1223,6 +1417,8 @@ class MarkItDown:
        llm_client: Optional[Any] = None,
        llm_model: Optional[str] = None,
        style_map: Optional[str] = None,
+        exiftool_path: Optional[str] = None,
+        docintel_endpoint: Optional[str] = None,
        # Deprecated
        mlm_client: Optional[Any] = None,
        mlm_model: Optional[str] = None,
@ -1232,6 +1428,9 @@ class MarkItDown:
        else:
            self._requests_session = requests_session

+        if exiftool_path is None:
+            exiftool_path = os.environ.get("EXIFTOOL_PATH")
+
        # Handle deprecation notices
        #############################
        if mlm_client is not None:
@ -1264,6 +1463,7 @@ class MarkItDown:
        self._llm_client = llm_client
        self._llm_model = llm_model
        self._style_map = style_map
+        self._exiftool_path = exiftool_path

        self._page_converters: List[DocumentConverter] = []

@ -1278,6 +1478,7 @@ class MarkItDown:
        self.register_page_converter(BingSerpConverter())
        self.register_page_converter(DocxConverter())
        self.register_page_converter(XlsxConverter())
+        self.register_page_converter(XlsConverter())
        self.register_page_converter(PptxConverter())
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
@ -1285,6 +1486,13 @@ class MarkItDown:
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
+        self.register_page_converter(OutlookMsgConverter())
+
+        # Register Document Intelligence converter at the top of the stack if endpoint is provided
+        if docintel_endpoint is not None:
+            self.register_page_converter(
+                DocumentIntelligenceConverter(endpoint=docintel_endpoint)
+            )

    def convert(
        self, source: Union[str, requests.Response, Path], **kwargs: Any
@ -1445,12 +1653,15 @@ class MarkItDown:
                if "llm_model" not in _kwargs and self._llm_model is not None:
                    _kwargs["llm_model"] = self._llm_model

-                # Add the list of converters for nested processing
-                _kwargs["_parent_converters"] = self._page_converters
-
                if "style_map" not in _kwargs and self._style_map is not None:
                    _kwargs["style_map"] = self._style_map

+                if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
+                    _kwargs["exiftool_path"] = self._exiftool_path
+
+                # Add the list of converters for nested processing
+                _kwargs["_parent_converters"] = self._page_converters
+
                # If we hit an error log it and keep trying
                try:
                    res = converter.convert(local_path, **_kwargs)
@ -1493,6 +1704,25 @@ class MarkItDown:
        # Use puremagic to guess
        try:
            guesses = puremagic.magic_file(path)
+
+            # Fix for: https://github.com/microsoft/markitdown/issues/222
+            # If there are no guesses, then try again after skipping any leading ASCII whitespace.
+            # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
+            # (space, tab, newline, carriage return, vertical tab, form feed).
+            if len(guesses) == 0:
+                with open(path, "rb") as file:
+                    while True:
+                        char = file.read(1)
+                        if not char:  # End of file
+                            break
+                        if not char.isspace():
+                            file.seek(file.tell() - 1)
+                            break
+                    try:
+                        guesses = puremagic.magic_stream(file)
+                    except puremagic.main.PureError:
+                        pass
+
            extensions = list()
            for g in guesses:
                ext = g.extension.strip()
--- a/tests/test_files/test.json
+++ b/tests/test_files/test.json
@ -0,0 +1,10 @@
+{
+    "key1": "string_value",
+    "key2": 1234,
+    "key3": [
+        "list_value1",
+        "list_value2"
+    ],
+    "5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key",
+    "uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3"
+}
--- a/tests/test_files/test.xls
+++ b/tests/test_files/test.xls
--- a/tests/test_files/test_outlook_msg.msg
+++ b/tests/test_files/test_outlook_msg.msg
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -54,6 +54,12 @@ XLSX_TEST_STRINGS = [
    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
 ]

+XLS_TEST_STRINGS = [
+    "## 09060124-b5e7-4717-9d07-3c046eb",
+    "6ff4173b-42a5-4784-9b19-f49caff4d93d",
+    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
+]
+
 DOCX_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
@ -63,6 +69,15 @@ DOCX_TEST_STRINGS = [
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 ]

+MSG_TEST_STRINGS = [
+    "# Email Message",
+    "**From:** test.sender@example.com",
+    "**To:** test.recipient@example.com",
+    "**Subject:** Test Email Message",
+    "## Content",
+    "This is the body of the test email message",
+]
+
 DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
@ -130,6 +145,11 @@ LLM_TEST_STRINGS = [
    "5bda1dd6",
 ]

+JSON_TEST_STRINGS = [
+    "5b64c88c-b3c3-4510-bcb8-da0b200602d8",
+    "9700dc99-6685-40b4-9a3a-5e406dcb37f3",
+]
+

 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
@ -176,6 +196,12 @@ def test_markitdown_local() -> None:
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
    validate_strings(result, XLSX_TEST_STRINGS)

+    # Test XLS processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls"))
+    for test_string in XLS_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
    # Test DOCX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
    validate_strings(result, DOCX_TEST_STRINGS)
@ -232,15 +258,48 @@ def test_markitdown_local() -> None:
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
    validate_strings(result, CSV_CP932_TEST_STRINGS)

+    # Test MSG (Outlook email) processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
+    validate_strings(result, MSG_TEST_STRINGS)
+
+    # Test JSON processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
+    validate_strings(result, JSON_TEST_STRINGS)
+
+    # Test input with leading blank characters
+    input_data = b"   \n\n\n<html><body><h1>Test</h1></body></html>"
+    result = markitdown.convert_stream(io.BytesIO(input_data))
+    assert "# Test" in result.text_content
+

@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
 )
 def test_markitdown_exiftool() -> None:
+    # Test that automatic discovery of exiftool emits a warning
+    # and is disabled
+    try:
+        with catch_warnings(record=True) as w:
            markitdown = MarkItDown()
+            result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert result.text_content.strip() == ""
+    finally:
+        resetwarnings()

-    # Test JPG metadata processing
+    # Test explicitly setting the location of exiftool
+    which_exiftool = shutil.which("exiftool")
+    markitdown = MarkItDown(exiftool_path=which_exiftool)
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
+    for key in JPG_TEST_EXIFTOOL:
+        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
+        assert target in result.text_content
+
+    # Test setting the exiftool path through an environment variable
+    os.environ["EXIFTOOL_PATH"] = which_exiftool
+    markitdown = MarkItDown()
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
@ -302,8 +361,8 @@ def test_markitdown_llm() -> None:

 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
-    test_markitdown_remote()
-    test_markitdown_local()
+    # test_markitdown_remote()
+    # test_markitdown_local()
    test_markitdown_exiftool()
-    test_markitdown_deprecation()
-    test_markitdown_llm()
+    # test_markitdown_deprecation()
+    # test_markitdown_llm()