Merge branch 'main' into patch-2

2025-02-08 20:27:29 -08:00 · 2025-02-08 20:27:29 -08:00 · 621e96ad3f
commit 621e96ad3f
parent 08a45fa4bd bf6a15e9b5
5 changed files with 191 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,3 @@
 > [!IMPORTANT]
 > (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year!
 # MarkItDown
 [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
@ -36,12 +33,20 @@ Or use `-o` to specify the output file:
 markitdown path-to-file.pdf -o document.md
 ```
 To use Document Intelligence conversion:
 ```bash
 markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
 ```
 You can also pipe content:
 ```bash
 cat path-to-file.pdf | markitdown
 ```
 More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
 ### Python API
 Basic usage in Python:
@ -54,6 +59,16 @@ result = md.convert("test.xlsx")
 print(result.text_content)
 ```
 Document Intelligence conversion in Python:
 ```python
 from markitdown import MarkItDown
 md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
 result = md.convert("test.pdf")
 print(result.text_content)
 ```
 To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
 ```python
--- a/pyproject.toml
+++ b/pyproject.toml
@ -42,6 +42,8 @@ dependencies = [
  "pathvalidate",
  "charset-normalizer",
  "openai",
  "azure-ai-documentintelligence",
  "azure-identity"
 ]
 [project.urls]
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@ -4,8 +4,8 @@
 import argparse
 import sys
 from textwrap import dedent
-from .__about__ import __version__
+from __about__ import __version__
-from ._markitdown import MarkItDown, DocumentConverterResult
+from _markitdown import MarkItDown, DocumentConverterResult
 def main():
@ -57,15 +57,36 @@ def main():
        "--output",
        help="Output file name. If not provided, output is written to stdout.",
    )
    parser.add_argument(
        "-d",
        "--use-docintel",
        action="store_true",
        help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
    )
    parser.add_argument(
        "-e",
        "--endpoint",
        type=str,
        help="Document Intelligence Endpoint. Required if using Document Intelligence.",
    )
    args = parser.parse_args()
-    if args.filename is None:
+    if args.use_docintel:
-        markitdown = MarkItDown()
+        if args.endpoint is None:
-        result = markitdown.convert_stream(sys.stdin.buffer)
+            raise ValueError(
-        _handle_output(args, result)
+                "Document Intelligence Endpoint is required when using Document Intelligence."
            )
        elif args.filename is None:
            raise ValueError("Filename is required when using Document Intelligence.")
        markitdown = MarkItDown(docintel_endpoint=args.endpoint)
    else:
        markitdown = MarkItDown()
    if args.filename is None:
        result = markitdown.convert_stream(sys.stdin.buffer)
    else:
        result = markitdown.convert(args.filename)
    _handle_output(args, result)
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -33,6 +33,19 @@ import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path
 # Azure imports
 from azure.ai.documentintelligence import DocumentIntelligenceClient
 from azure.ai.documentintelligence.models import (
    AnalyzeDocumentRequest,
    AnalyzeResult,
    DocumentAnalysisFeature,
 )
 from azure.identity import DefaultAzureCredential
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
 # This constant is a temporary fix until the bug is resolved.
 CONTENT_FORMAT = "markdown"
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
@ -895,14 +908,25 @@ class MediaConverter(DocumentConverter):
    Abstract class for multi-modal media (e.g., images and audio)
    """
-    def _get_metadata(self, local_path):
+    def _get_metadata(self, local_path, exiftool_path=None):
-        exiftool = shutil.which("exiftool")
+        if not exiftool_path:
-        if not exiftool:
+            which_exiftool = shutil.which("exiftool")
            if which_exiftool:
                warn(
                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., 
    md = MarkItDown(exiftool_path="{which_exiftool}")
 This warning will be removed in future releases.
 """,
                    DeprecationWarning,
                )
            return None
        else:
            try:
                result = subprocess.run(
-                    [exiftool, "-json", local_path], capture_output=True, text=True
+                    [exiftool_path, "-json", local_path], capture_output=True, text=True
                ).stdout
                return json.loads(result)[0]
            except Exception:
@ -923,7 +947,7 @@ class WavConverter(MediaConverter):
        md_content = ""
        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "Title",
@ -978,7 +1002,7 @@ class Mp3Converter(WavConverter):
        md_content = ""
        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "Title",
@ -1039,7 +1063,7 @@ class ImageConverter(MediaConverter):
        md_content = ""
        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "ImageSize",
@ -1310,6 +1334,74 @@ class ZipConverter(DocumentConverter):
            )
 class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.
    The file is uploaded as raw bytes to the "prebuilt-layout" model, Markdown output is
    requested, and HTML comments are stripped from the returned content.
    """
    # Extensions this converter will attempt; convert() returns None for anything else.
    _SUPPORTED_EXTENSIONS = frozenset(
        {
            ".pdf",
            ".docx",
            ".xlsx",
            ".pptx",
            ".html",
            ".jpeg",
            ".jpg",
            ".png",
            ".bmp",
            ".tiff",
            ".heif",
        }
    )
    # Extensions for which the optional analysis features are not requested.
    _NO_FEATURE_EXTENSIONS = frozenset({".xlsx", ".pptx", ".html"})
    def __init__(
        self,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        """Create the Document Intelligence client.
        Args:
            endpoint: Document Intelligence endpoint passed to the client.
            api_version: Service API version passed to the client.
        Authentication uses ``DefaultAzureCredential()``.
        """
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=DefaultAzureCredential(),
        )
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the file at ``local_path`` to Markdown via Document Intelligence.
        Args:
            local_path: Path of the file to read and upload.
            **kwargs: Only ``file_extension`` is read here (e.g. ``".pdf"``); it is
                matched case-insensitively.
        Returns:
            None when the extension is missing or unsupported; otherwise a
            DocumentConverterResult with ``title=None`` and the Markdown text.
        """
        # Normalize once. `or ""` also covers an explicit file_extension=None,
        # which previously raised AttributeError on `.lower()`.
        extension = (kwargs.get("file_extension") or "").lower()
        # Bail if extension is not supported by Document Intelligence
        if extension not in self._SUPPORTED_EXTENSIONS:
            return None
        # Get the bytestring for the local path
        with open(local_path, "rb") as f:
            file_bytes = f.read()
        # Certain document analysis features are not available for some filetypes (.xlsx, .pptx, .html)
        if extension in self._NO_FEATURE_EXTENSIONS:
            analysis_features = []
        else:
            analysis_features = [
                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
            ]
        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        )
        # Blocks until the analysis operation completes.
        result: AnalyzeResult = poller.result()
        # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(
            title=None,
            text_content=markdown_text,
        )
 class FileConversionException(BaseException):
    """Marker exception for file-conversion failures; adds nothing to its base class.
    NOTE(review): this derives from BaseException, so ``except Exception`` handlers
    will not catch it -- confirm that is intended before changing the base class.
    """
@ -1328,6 +1420,8 @@ class MarkItDown:
        llm_client: Optional[Any] = None,
        llm_model: Optional[str] = None,
        style_map: Optional[str] = None,
        exiftool_path: Optional[str] = None,
        docintel_endpoint: Optional[str] = None,
        # Deprecated
        mlm_client: Optional[Any] = None,
        mlm_model: Optional[str] = None,
@ -1337,6 +1431,9 @@ class MarkItDown:
        else:
            self._requests_session = requests_session
        if exiftool_path is None:
            exiftool_path = os.environ.get("EXIFTOOL_PATH")
        # Handle deprecation notices
        #############################
        if mlm_client is not None:
@ -1369,6 +1466,7 @@ class MarkItDown:
        self._llm_client = llm_client
        self._llm_model = llm_model
        self._style_map = style_map
        self._exiftool_path = exiftool_path
        self._page_converters: List[DocumentConverter] = []
@ -1393,6 +1491,12 @@ class MarkItDown:
        self.register_page_converter(ZipConverter())
        self.register_page_converter(OutlookMsgConverter())
        # Register Document Intelligence converter at the top of the stack if endpoint is provided
        if docintel_endpoint is not None:
            self.register_page_converter(
                DocumentIntelligenceConverter(endpoint=docintel_endpoint)
            )
    def convert(
        self, source: Union[str, requests.Response, Path], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
@ -1552,12 +1656,15 @@ class MarkItDown:
                if "llm_model" not in _kwargs and self._llm_model is not None:
                    _kwargs["llm_model"] = self._llm_model
                # Add the list of converters for nested processing
                _kwargs["_parent_converters"] = self._page_converters
                if "style_map" not in _kwargs and self._style_map is not None:
                    _kwargs["style_map"] = self._style_map
                if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
                    _kwargs["exiftool_path"] = self._exiftool_path
                # Add the list of converters for nested processing
                _kwargs["_parent_converters"] = self._page_converters
                # If we hit an error log it and keep trying
                try:
                    res = converter.convert(local_path, **_kwargs)
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -277,9 +277,29 @@ def test_markitdown_local() -> None:
    reason="do not run if exiftool is not installed",
 )
 def test_markitdown_exiftool() -> None:
    # Test the automatic discovery of exiftool throws a warning
    # and is disabled
    try:
        with catch_warnings(record=True) as w:
            markitdown = MarkItDown()
            result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert result.text_content.strip() == ""
    finally:
        resetwarnings()
-    # Test JPG metadata processing
+    # Test explicitly setting the location of exiftool
    which_exiftool = shutil.which("exiftool")
    markitdown = MarkItDown(exiftool_path=which_exiftool)
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
        assert target in result.text_content
    # Test setting the exiftool path through an environment variable
    os.environ["EXIFTOOL_PATH"] = which_exiftool
    markitdown = MarkItDown()
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
@ -341,8 +361,8 @@ def test_markitdown_llm() -> None:
 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
-    test_markitdown_remote()
+    # test_markitdown_remote()
-    test_markitdown_local()
+    # test_markitdown_local()
    test_markitdown_exiftool()
-    test_markitdown_deprecation()
+    # test_markitdown_deprecation()
-    test_markitdown_llm()
+    # test_markitdown_llm()