Added finer-grained control over which file types the Document Intelligence converter accepts.

2025-03-24 14:02:55 -07:00 · 2025-03-24 14:02:55 -07:00 · d4f0e7f662
commit d4f0e7f662
parent f8a974f8a1
2 changed files with 104 additions and 34 deletions
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
 from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
-from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._doc_intel_converter import (
    DocumentIntelligenceConverter,
    DocumentIntelligenceFileType,
 )
 from ._epub_converter import EpubConverter
 __all__ = [
@@ -38,5 +41,6 @@ __all__ = [
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
    "DocumentIntelligenceFileType",
    "EpubConverter",
 ]
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -3,6 +3,7 @@ import re
 import os
 from typing import BinaryIO, Any, List
 from enum import Enum
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -31,38 +32,74 @@ except ImportError:
 CONTENT_FORMAT = "markdown"
-OFFICE_MIME_TYPE_PREFIXES = [
+class DocumentIntelligenceFileType(str, Enum):
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    """Enum of file types supported by the Document Intelligence Converter."""
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.presentationml",
    "application/xhtml",
    "text/html",
 ]
-OTHER_MIME_TYPE_PREFIXES = [
+    # No OCR: the document formats the converter lists in its no_ocr_types
-    "application/pdf",
+    DOCX = "docx"
-    "application/x-pdf",
+    PPTX = "pptx"
-    "text/html",
+    XLSX = "xlsx"
-    "image/",
+    HTML = "html"
-]
+    # OCR: PDF and image formats
    PDF = "pdf"
    JPEG = "jpeg"
    PNG = "png"
    BMP = "bmp"
    TIFF = "tiff"
 OFFICE_FILE_EXTENSIONS = [
    ".docx",
    ".xlsx",
    ".pptx",
    ".html",
    ".htm",
 ]
-OTHER_FILE_EXTENSIONS = [
+def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
-    ".pdf",
+    """Get the MIME type prefixes for the given file types."""
-    ".jpeg",
+    prefixes: List[str] = []
-    ".jpg",
+    for type_ in types:
-    ".png",
+        if type_ == DocumentIntelligenceFileType.DOCX:
-    ".bmp",
+            prefixes.append(
-    ".tiff",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    ".heif",
+            )
-]
+        elif type_ == DocumentIntelligenceFileType.PPTX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.presentationml"
            )
        elif type_ == DocumentIntelligenceFileType.XLSX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        elif type_ == DocumentIntelligenceFileType.PDF:
            prefixes.append("application/pdf")
            prefixes.append("application/x-pdf")
        elif type_ == DocumentIntelligenceFileType.JPEG:
            prefixes.append("image/jpeg")
        elif type_ == DocumentIntelligenceFileType.PNG:
            prefixes.append("image/png")
        elif type_ == DocumentIntelligenceFileType.BMP:
            prefixes.append("image/bmp")
        elif type_ == DocumentIntelligenceFileType.TIFF:
            prefixes.append("image/tiff")
    return prefixes
 def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the file extensions for the given file types.

    Args:
        types: The file types to translate into file extensions.

    Returns:
        The lowercase, dot-prefixed extensions for each entry of ``types``,
        in the order the types were given. A type with no known extension
        contributes nothing.
    """
    extensions: List[str] = []
    for type_ in types:
        if type_ == DocumentIntelligenceFileType.DOCX:
            extensions.append(".docx")
        elif type_ == DocumentIntelligenceFileType.PPTX:
            extensions.append(".pptx")
        elif type_ == DocumentIntelligenceFileType.XLSX:
            extensions.append(".xlsx")
        elif type_ == DocumentIntelligenceFileType.HTML:
            # HTML is a declared file type and is listed among the converter's
            # no-OCR types, so it needs a mapping here or it can never match
            # by extension.
            extensions.append(".html")
            extensions.append(".htm")
        elif type_ == DocumentIntelligenceFileType.PDF:
            extensions.append(".pdf")
        elif type_ == DocumentIntelligenceFileType.JPEG:
            extensions.append(".jpg")
            extensions.append(".jpeg")
        elif type_ == DocumentIntelligenceFileType.PNG:
            extensions.append(".png")
        elif type_ == DocumentIntelligenceFileType.BMP:
            extensions.append(".bmp")
        elif type_ == DocumentIntelligenceFileType.TIFF:
            extensions.append(".tiff")
    return extensions
 class DocumentIntelligenceConverter(DocumentConverter):
@@ -74,8 +111,29 @@ class DocumentIntelligenceConverter(DocumentConverter):
        endpoint: str,
        api_version: str = "2024-07-31-preview",
        credential: AzureKeyCredential | TokenCredential | None = None,
        file_types: List[DocumentIntelligenceFileType] = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.PDF,
            DocumentIntelligenceFileType.JPEG,
            DocumentIntelligenceFileType.PNG,
            DocumentIntelligenceFileType.BMP,
            DocumentIntelligenceFileType.TIFF,
        ],
    ):
        """
        Initialize the DocumentIntelligenceConverter.
        Args:
            endpoint (str): The endpoint for the Document Intelligence service.
            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
            credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
            file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
        """
        super().__init__()
        self._file_types = file_types
        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
@@ -112,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
-        if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
+        if extension in _get_file_extensions(self._file_types):
            return True
-        for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
+        for prefix in _get_mime_type_prefixes(self._file_types):
            if mimetype.startswith(prefix):
                return True
@@ -130,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
-        if extension in OFFICE_FILE_EXTENSIONS:
+        # Types that don't support ocr
        no_ocr_types = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.HTML,
        ]
        if extension in _get_file_extensions(no_ocr_types):
            return []
-        for prefix in OFFICE_MIME_TYPE_PREFIXES:
+        for prefix in _get_mime_type_prefixes(no_ocr_types):
            if mimetype.startswith(prefix):
                return []