added isolated doc_intel main conversion function

2025-01-09 15:27:03 -05:00 · 2025-01-09 15:27:03 -05:00 · 811e4413aa
commit 811e4413aa
parent 62a0d6c082
1 changed files with 98 additions and 51 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -168,54 +168,6 @@ class DocumentConverter:
    ) -> Union[None, DocumentConverterResult]:
        raise NotImplementedError()
 class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.

    The converter sends the raw bytes of a local file to the
    ``prebuilt-layout`` model and returns the Markdown produced by the
    service with HTML comments stripped out.
    """

    def __init__(
        self,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        """Create the Document Intelligence client.

        Args:
            endpoint: Endpoint passed to ``DocumentIntelligenceClient``.
            api_version: Service API version passed to the client.
        """
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=DefaultAzureCredential(),
        )

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the file at ``local_path`` to Markdown.

        Args:
            local_path: Path of the file to analyze.
            **kwargs: May contain ``file_extension`` (e.g. ``".pdf"``), which
                selects the analysis features that are requested.

        Returns:
            A ``DocumentConverterResult`` with ``title=None`` and the
            comment-free Markdown as ``text_content``.
        """
        # A missing or None extension is treated as "no extension".
        extension = (kwargs.get("file_extension") or "").lower()

        # Get the bytestring for the local path; the context manager makes
        # sure the file handle is closed even if reading fails.
        with open(local_path, "rb") as file_handle:
            file_bytes = file_handle.read()

        # Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html)
        if extension in (".xlsx", ".pptx", ".html"):
            analysis_features = []
        else:
            analysis_features = [
                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
            ]

        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
            output_content_format=ContentFormat.MARKDOWN,
        )
        result: AnalyzeResult = poller.result()

        # Remove comments from the markdown content generated by Doc Intelligence
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(
            title=None,
            text_content=markdown_text,
        )
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
@ -1374,6 +1326,57 @@ class ZipConverter(DocumentConverter):
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
 class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.

    Files whose extension is not in the supported list are declined by
    returning ``None``; supported files are sent to the ``prebuilt-layout``
    model and the resulting Markdown is returned with HTML comments removed.
    """

    def __init__(
        self,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        """Create the Document Intelligence client.

        Args:
            endpoint: Endpoint passed to ``DocumentIntelligenceClient``.
            api_version: Service API version passed to the client.
        """
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=DefaultAzureCredential(),
        )

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the file at ``local_path`` to Markdown.

        Args:
            local_path: Path of the file to analyze.
            **kwargs: May contain ``file_extension`` (e.g. ``".pdf"``), which
                decides whether the file is handled at all and which
                analysis features are requested.

        Returns:
            ``None`` when the extension is not supported; otherwise a
            ``DocumentConverterResult`` with ``title=None`` and the
            comment-free Markdown as ``text_content``.
        """
        # A missing or None extension is treated as "no extension".
        extension = (kwargs.get("file_extension") or "").lower()

        # Bail if extension is not supported by Document Intelligence
        docintel_extensions = (
            ".pdf",
            ".docx",
            ".xlsx",
            ".pptx",
            ".html",
            ".jpeg",
            ".jpg",
            ".png",
            ".bmp",
            ".tiff",
            ".heif",
        )
        if extension not in docintel_extensions:
            return None

        # Get the bytestring for the local path; the context manager makes
        # sure the file handle is closed even if reading fails.
        with open(local_path, "rb") as file_handle:
            file_bytes = file_handle.read()

        # Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html)
        if extension in (".xlsx", ".pptx", ".html"):
            analysis_features = []
        else:
            analysis_features = [
                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
            ]

        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
            output_content_format=ContentFormat.MARKDOWN,
        )
        result: AnalyzeResult = poller.result()

        # Remove comments from the markdown content generated by Doc Intelligence
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(
            title=None,
            text_content=markdown_text,
        )
 class FileConversionException(Exception):
    """Raised when a file's type was recognized but converting it to Markdown failed.

    Derives from ``Exception`` rather than ``BaseException``: the latter is
    reserved for system-exiting exceptions such as ``KeyboardInterrupt`` and
    would slip past ordinary ``except Exception`` handlers.
    """
@ -1441,9 +1444,8 @@ class MarkItDown:
        self._style_map = style_map
        self._exiftool_path = exiftool_path
-        self._docintel_endpoint = docintel_endpoint
+        if docintel_endpoint is not None:
-        if self._docintel_endpoint is not None:
+            self._docintel_converter = DocumentIntelligenceConverter(endpoint=docintel_endpoint)
            self._docintel_client = DocumentIntelligenceConverter(endpoint=self._docintel_endpoint)
        self._page_converters: List[DocumentConverter] = []
@ -1510,6 +1512,8 @@ class MarkItDown:
            self._append_ext(extensions, g)
        # Convert
        if self._docintel_converter is not None:
            return self._convert_docintel(path, extensions, **kwargs)
        return self._convert(path, extensions, **kwargs)
    # TODO what should stream's type be?
@ -1594,6 +1598,8 @@ class MarkItDown:
                self._append_ext(extensions, g)
            # Convert
            if self._docintel_converter is not None:
                result = self._convert_docintel(temp_path, extensions, url=response.url, **kwargs)
            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
        # Clean up
        finally:
@ -1663,6 +1669,47 @@ class MarkItDown:
            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
        )
    def _convert_docintel(
        self, local_path: str, extensions: List[Union[str, None]], **kwargs
    ) -> DocumentConverterResult:
        """Convert ``local_path`` using only the Document Intelligence converter.

        Each candidate extension is tried in order, followed by one final
        attempt with no ``file_extension`` at all. The first attempt that
        yields a result wins.

        Args:
            local_path: Path of the file to convert.
            extensions: Candidate file extensions to try, in order.
            **kwargs: Extra options forwarded to the converter; a deep copy is
                made per attempt so the caller's values are never mutated.

        Returns:
            The converter result with trailing whitespace stripped from every
            line and runs of three or more newlines collapsed to two.

        Raises:
            FileConversionException: If no attempt succeeded and at least one
                attempt raised an exception.
            UnsupportedFormatException: If every attempt returned ``None``
                without raising.
        """
        error_trace = ""
        for ext in extensions + [None]:  # Try last with no extension
            _kwargs = copy.deepcopy(kwargs)

            # Overwrite file_extension appropriately
            if ext is None:
                _kwargs.pop("file_extension", None)
            else:
                _kwargs["file_extension"] = ext

            # If we hit an error log it and keep trying
            try:
                res = self._docintel_converter.convert(local_path, **_kwargs)
            except Exception:
                error_trace = ("\n\n" + traceback.format_exc()).strip()
                # Move on explicitly: `res` was never assigned for this attempt,
                # so it must not be inspected below.
                continue

            if res is None:
                continue

            # Normalize the content
            res.text_content = "\n".join(
                [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
            )
            res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
            return res

        # If we got this far without success, report any exceptions
        if error_trace:
            raise FileConversionException(
                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
            )

        # Extension not supported by Document Intelligence
        raise UnsupportedFormatException(
            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported by Document Intelligence."
        )
    def _append_ext(self, extensions, ext):
        """Append a unique non-None, non-empty extension to a list of extensions."""
        if ext is None: