From 811e4413aa744d29d90741ed9747c15701ff62ca Mon Sep 17 00:00:00 2001
From: Kenny Zhang <kzhang678@gmail.com>
Date: Thu, 9 Jan 2025 15:27:03 -0500
Subject: [PATCH] added isolated doc_intel main conversion function

---
 src/markitdown/_markitdown.py | 149 ++++++++++++++++++++++------------
 1 file changed, 98 insertions(+), 51 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index d2c8a0f..1dd5b6b 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -167,54 +167,6 @@ class DocumentConverter:
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         raise NotImplementedError()
-    
-class DocumentIntelligenceConverter(DocumentConverter):
-    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
-
-    def __init__(
-        self,
-        endpoint: str,
-        api_version: str = "2024-07-31-preview",
-    ):
-        self.endpoint = endpoint
-        self.api_version = api_version
-        self.doc_intel_client = DocumentIntelligenceClient(
-            endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential()
-        )
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        extension = kwargs.get("file_extension", "")
-
-        # Get the bytestring for the local path
-        file_bytes = open(local_path, "rb").read()
-
-        # Certain document analysis features are not availiable for filetypes (.xlsx, .pptx, .html)
-        if extension.lower() in [".xlsx", ".pptx", ".html"]:
-            analysis_features = []
-        else:
-            analysis_features = [
-                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
-                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
-                DocumentAnalysisFeature.STYLE_FONT  # enable font style extraction
-            ]
-        
-        # Extract the text using Azure Document Intelligence
-        poller = self.doc_intel_client.begin_analyze_document(
-            model_id="prebuilt-layout",
-            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
-            features=analysis_features,
-            output_content_format=ContentFormat.MARKDOWN,
-        )
-        result: AnalyzeResult = poller.result()
-
-        # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
-        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
-        return DocumentConverterResult(
-            title=None,
-            text_content=markdown_text,
-        )
 
 class PlainTextConverter(DocumentConverter):
     """Anything with content type text/plain"""
@@ -1373,7 +1325,58 @@ class ZipConverter(DocumentConverter):
                 title=None,
                 text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
             )
+        
+class DocumentIntelligenceConverter(DocumentConverter):
+    """DocumentConverter that extracts Markdown via Document Intelligence; the client uses DefaultAzureCredential."""
 
+    def __init__(
+        self,
+        endpoint: str,
+        api_version: str = "2024-07-31-preview",
+    ):
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.doc_intel_client = DocumentIntelligenceClient(
+            endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential()
+        )
+
+    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+        """Convert the file at `local_path` to Markdown with the "prebuilt-layout" model.
+
+        Returns None when kwargs["file_extension"] is missing, None, or not a supported extension.
+        """
+        # Bail if extension is not supported by Document Intelligence (a None extension counts as unsupported)
+        extension = (kwargs.get("file_extension") or "").lower()
+        docintel_extensions = [".pdf", ".docx", ".xlsx", ".pptx", ".html", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif"]
+        if extension not in docintel_extensions:
+            return None
+
+        # Get the bytestring for the local path; the context manager closes the handle even if read() fails
+        with open(local_path, "rb") as file_handle:
+            file_bytes = file_handle.read()
+
+        # Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html)
+        if extension in [".xlsx", ".pptx", ".html"]:
+            analysis_features = []
+        else:
+            analysis_features = [
+                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
+                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
+                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
+            ]
+
+        # Extract the text using Azure Document Intelligence
+        poller = self.doc_intel_client.begin_analyze_document(
+            model_id="prebuilt-layout",
+            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
+            features=analysis_features,
+            output_content_format=ContentFormat.MARKDOWN,
+        )
+        result: AnalyzeResult = poller.result()
+
+        # Remove the HTML comments that Document Intelligence embeds in its Markdown output
+        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+        return DocumentConverterResult(title=None, text_content=markdown_text)
 
 class FileConversionException(BaseException):
     pass
@@ -1441,9 +1444,8 @@ class MarkItDown:
         self._style_map = style_map
         self._exiftool_path = exiftool_path
 
-        self._docintel_endpoint = docintel_endpoint
-        if self._docintel_endpoint is not None:
-            self._docintel_client = DocumentIntelligenceConverter(endpoint=self._docintel_endpoint)
+        # Always assign the attribute (None = Document Intelligence disabled) so `is not None` checks cannot raise AttributeError
+        self._docintel_converter = None if docintel_endpoint is None else DocumentIntelligenceConverter(endpoint=docintel_endpoint)
 
         self._page_converters: List[DocumentConverter] = []
 
@@ -1510,6 +1512,8 @@ class MarkItDown:
             self._append_ext(extensions, g)
 
         # Convert
+        if getattr(self, "_docintel_converter", None) is not None:  # getattr: the attribute may never have been assigned
+            return self._convert_docintel(path, extensions, **kwargs)
         return self._convert(path, extensions, **kwargs)
 
     # TODO what should stream's type be?
@@ -1594,6 +1598,8 @@ class MarkItDown:
                 self._append_ext(extensions, g)
 
             # Convert
+            if getattr(self, "_docintel_converter", None) is not None:  # return here (finally still cleans up); falling through would overwrite the result
+                return self._convert_docintel(temp_path, extensions, url=response.url, **kwargs)
             result = self._convert(temp_path, extensions, url=response.url, **kwargs)
         # Clean up
         finally:
@@ -1662,6 +1668,47 @@ class MarkItDown:
         raise UnsupportedFormatException(
             f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
         )
+    
+    def _convert_docintel(
+            self, local_path: str, extensions: List[Union[str, None]], **kwargs
+    ) -> DocumentConverterResult:
+        """Convert `local_path` using only the Document Intelligence converter.
+
+        Tries each extension, then no extension; returns the first non-None result, whitespace-normalized.
+        Raises FileConversionException if an attempt raised and none succeeded, else UnsupportedFormatException.
+        """
+        error_trace = ""
+        for ext in extensions + [None]:  # Try last with no extension
+            _kwargs = copy.deepcopy(kwargs)
+            # Overwrite file_extension appropriately
+            if ext is None:
+                _kwargs.pop("file_extension", None)
+            else:
+                _kwargs["file_extension"] = ext
+
+            # If we hit an error, record it and keep trying (`res` is not bound when convert() raises)
+            try:
+                res = self._docintel_converter.convert(local_path, **_kwargs)
+            except Exception:
+                error_trace = ("\n\n" + traceback.format_exc()).strip()
+                continue
+
+            if res is not None:
+                # Normalize the content: strip trailing whitespace, then collapse runs of blank lines
+                lines = [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
+                res.text_content = re.sub(r"\n{3,}", "\n\n", "\n".join(lines))
+                return res
+
+        # If we got this far without success, report any exceptions
+        if error_trace:
+            raise FileConversionException(
+                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+            )
+
+        # Extension not supported by Document Intelligence
+        raise UnsupportedFormatException(
+            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported by Document Intelligence."
+        )
 
     def _append_ext(self, extensions, ext):
         """Append a unique non-None, non-empty extension to a list of extensions."""