added DocumentIntelligenceConverter class implementation

2025-01-09 14:41:14 -05:00 · 2025-01-09 14:41:14 -05:00 · 06080eb2e8
commit 06080eb2e8
parent d6debbdaf7
2 changed files with 62 additions and 3 deletions
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@ -4,8 +4,8 @@
 import argparse
 import sys
 from textwrap import dedent
-from .__about__ import __version__
+from __about__ import __version__
-from ._markitdown import MarkItDown, DocumentConverterResult
+from _markitdown import MarkItDown, DocumentConverterResult
 def main():
@ -76,7 +76,7 @@ def main():
            raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.")
        elif args.filename is None:
            raise ValueError("Filename is required when using Document Intelligence.")
-        markitdown = MarkItDown(endpoint=args.endpoint)
+        markitdown = MarkItDown(docintel_endpoint=args.endpoint)
    else:
        markitdown = MarkItDown()
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -33,6 +33,16 @@ import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path
 # Azure imports
 from azure.ai.documentintelligence import DocumentIntelligenceClient
 from azure.ai.documentintelligence.models import (
    AnalyzeDocumentRequest,
    AnalyzeResult,
    ContentFormat,
    DocumentAnalysisFeature,
 )
 from azure.identity import DefaultAzureCredential
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
@ -157,7 +167,54 @@ class DocumentConverter:
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        raise NotImplementedError()
 class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.

    The document is sent to the ``prebuilt-layout`` model with Markdown
    requested as the output format. HTML-style comments in the returned
    content are stripped before it is wrapped in a DocumentConverterResult.
    """

    def __init__(
        self,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        """Create the Document Intelligence client used by ``convert``.

        Args:
            endpoint: Document Intelligence endpoint passed to the client.
            api_version: Service API version passed to the client.
        """
        self.endpoint = endpoint
        self.api_version = api_version
        # Authentication is delegated to DefaultAzureCredential.
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=DefaultAzureCredential(),
        )

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Analyze the file at ``local_path`` and return its content as Markdown.

        Args:
            local_path: Path of the file whose bytes are sent for analysis.
            **kwargs: Only ``file_extension`` is read (defaults to ""); it
                decides whether the optional analysis features are requested.

        Returns:
            A DocumentConverterResult with ``title=None`` and the
            comment-stripped Markdown as ``text_content``.

        Raises:
            OSError: If ``local_path`` cannot be opened or read.
        """
        extension = kwargs.get("file_extension", "")

        # Read the whole file as bytes; the context manager guarantees the
        # handle is closed even if the read fails.
        with open(local_path, "rb") as file_handle:
            file_bytes = file_handle.read()

        # Certain document analysis features are not available for some
        # filetypes (.xlsx, .pptx, .html), so request none for those.
        if extension.lower() in (".xlsx", ".pptx", ".html"):
            analysis_features = []
        else:
            analysis_features = [
                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
            ]

        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
            output_content_format=ContentFormat.MARKDOWN,
        )
        # Wait for the analysis operation to finish and fetch its result.
        result: AnalyzeResult = poller.result()

        # Remove comments (<!-- ... -->, possibly spanning lines) from the
        # Markdown content generated by Document Intelligence.
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(
            title=None,
            text_content=markdown_text,
        )
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
@ -1337,6 +1394,7 @@ class MarkItDown:
        llm_model: Optional[str] = None,
        style_map: Optional[str] = None,
        exiftool_path: Optional[str] = None,
        docintel_endpoint: Optional[str] = None,
        # Deprecated
        mlm_client: Optional[Any] = None,
        mlm_model: Optional[str] = None,
@ -1382,6 +1440,7 @@ class MarkItDown:
        self._llm_model = llm_model
        self._style_map = style_map
        self._exiftool_path = exiftool_path
        self._docintel_endpoint = docintel_endpoint
        self._page_converters: List[DocumentConverter] = []