From 06080eb2e85a5b1b4886b5d32a71b26b3fe3e92d Mon Sep 17 00:00:00 2001
From: Kenny Zhang <kzhang678@gmail.com>
Date: Thu, 9 Jan 2025 14:41:14 -0500
Subject: [PATCH] added DocumentIntelligenceConverter class implementation

---
 src/markitdown/__main__.py    |  6 ++--
 src/markitdown/_markitdown.py | 59 +++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index 45652f1..5bc9ede 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -4,8 +4,8 @@
 import argparse
 import sys
 from textwrap import dedent
-from .__about__ import __version__
-from ._markitdown import MarkItDown, DocumentConverterResult
+from __about__ import __version__
+from _markitdown import MarkItDown, DocumentConverterResult
 
 
 def main():
@@ -76,7 +76,7 @@ def main():
             raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.")
         elif args.filename is None:
             raise ValueError("Filename is required when using Document Intelligence.")
-        markitdown = MarkItDown(endpoint=args.endpoint)
+        markitdown = MarkItDown(docintel_endpoint=args.endpoint)
     else:
         markitdown = MarkItDown()
 
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 33806e1..00052ba 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -33,6 +33,16 @@ import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path
 
+# Azure imports
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import (
+    AnalyzeDocumentRequest,
+    AnalyzeResult,
+    ContentFormat,
+    DocumentAnalysisFeature,
+)
+from azure.identity import DefaultAzureCredential
+
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
@@ -157,7 +167,54 @@ class DocumentConverter:
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         raise NotImplementedError()
+    
+class DocumentIntelligenceConverter(DocumentConverter):
+    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""

+    def __init__(
+        self,
+        endpoint: str,
+        api_version: str = "2024-07-31-preview",
+    ):
+        """Create the Document Intelligence client for `endpoint` using DefaultAzureCredential."""
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.doc_intel_client = DocumentIntelligenceClient(
+            endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential()
+        )
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Analyze `local_path` with the "prebuilt-layout" model and return its Markdown (title is None)."""
+        extension = kwargs.get("file_extension") or ""  # tolerate a missing or None extension
+
+        # Read the file's bytes; the context manager closes the handle instead of leaking it
+        with open(local_path, "rb") as file_handle:
+            file_bytes = file_handle.read()
+
+        # Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html)
+        if extension.lower() in [".xlsx", ".pptx", ".html"]:
+            analysis_features = []
+        else:
+            analysis_features = [
+                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
+                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
+                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
+            ]
+
+        # Extract the text using Azure Document Intelligence
+        poller = self.doc_intel_client.begin_analyze_document(
+            model_id="prebuilt-layout",
+            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
+            features=analysis_features,
+            output_content_format=ContentFormat.MARKDOWN,
+        )
+        result: AnalyzeResult = poller.result()
+
+        # Strip the HTML comments that Document Intelligence embeds in its Markdown output
+        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+        return DocumentConverterResult(title=None, text_content=markdown_text)
 
 class PlainTextConverter(DocumentConverter):
     """Anything with content type text/plain"""
@@ -1337,6 +1394,7 @@ class MarkItDown:
         llm_model: Optional[str] = None,
         style_map: Optional[str] = None,
         exiftool_path: Optional[str] = None,
+        docintel_endpoint: Optional[str] = None,
         # Deprecated
         mlm_client: Optional[Any] = None,
         mlm_model: Optional[str] = None,
@@ -1382,6 +1440,7 @@ class MarkItDown:
         self._llm_model = llm_model
         self._style_map = style_map
         self._exiftool_path = exiftool_path
+        self._docintel_endpoint = docintel_endpoint
 
         self._page_converters: List[DocumentConverter] = []