From 811e4413aa744d29d90741ed9747c15701ff62ca Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Thu, 9 Jan 2025 15:27:03 -0500 Subject: [PATCH] added isolated doc_intel main conversion function --- src/markitdown/_markitdown.py | 149 ++++++++++++++++++++++------------ 1 file changed, 98 insertions(+), 51 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index d2c8a0f..1dd5b6b 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -167,54 +167,6 @@ class DocumentConverter: self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: raise NotImplementedError() - -class DocumentIntelligenceConverter(DocumentConverter): - """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" - - def __init__( - self, - endpoint: str, - api_version: str = "2024-07-31-preview", - ): - self.endpoint = endpoint - self.api_version = api_version - self.doc_intel_client = DocumentIntelligenceClient( - endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential() - ) - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - extension = kwargs.get("file_extension", "") - - # Get the bytestring for the local path - file_bytes = open(local_path, "rb").read() - - # Certain document analysis features are not availiable for filetypes (.xlsx, .pptx, .html) - if extension.lower() in [".xlsx", ".pptx", ".html"]: - analysis_features = [] - else: - analysis_features = [ - DocumentAnalysisFeature.FORMULAS, # enable formula extraction - DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR - DocumentAnalysisFeature.STYLE_FONT # enable font style extraction - ] - - # Extract the text using Azure Document Intelligence - poller = self.doc_intel_client.begin_analyze_document( - model_id="prebuilt-layout", - analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes), - 
class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.

    The converter sends the raw bytes of a local file to the
    ``prebuilt-layout`` model and returns the Markdown the service produces.
    Files whose extension is not in ``_SUPPORTED_EXTENSIONS`` are declined by
    returning ``None``.
    """

    # Extensions this converter is willing to submit to the service.
    _SUPPORTED_EXTENSIONS = frozenset(
        {
            ".pdf",
            ".docx",
            ".xlsx",
            ".pptx",
            ".html",
            ".jpeg",
            ".jpg",
            ".png",
            ".bmp",
            ".tiff",
            ".heif",
        }
    )

    # Certain document analysis features are not available for these
    # file types, so they are analyzed with no add-on features requested.
    _NO_FEATURE_EXTENSIONS = frozenset({".xlsx", ".pptx", ".html"})

    def __init__(
        self,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        """Create the Document Intelligence client.

        Args:
            endpoint: URL of the Document Intelligence resource.
            api_version: Service API version passed to the client.
        """
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=DefaultAzureCredential(),
        )

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the file at ``local_path`` to Markdown.

        Args:
            local_path: Path of the file to analyze.
            **kwargs: Only ``file_extension`` is read; it selects whether the
                file is supported and which analysis features are requested.

        Returns:
            A ``DocumentConverterResult`` with ``title=None`` and the Markdown
            text, or ``None`` when the extension is not supported.
        """
        # Bail if extension is not supported by Document Intelligence.
        # `or ""` also covers an explicit file_extension=None.
        extension = (kwargs.get("file_extension") or "").lower()
        if extension not in self._SUPPORTED_EXTENSIONS:
            return None

        # Read the file inside a context manager so the handle is always closed.
        with open(local_path, "rb") as file_handle:
            file_bytes = file_handle.read()

        if extension in self._NO_FEATURE_EXTENSIONS:
            analysis_features = []
        else:
            analysis_features = [
                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
            ]

        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
            output_content_format=ContentFormat.MARKDOWN,
        )
        result: AnalyzeResult = poller.result()

        # Remove comments from the Markdown generated by Doc Intelligence.
        # NOTE(review): assumes the comments are HTML-style (<!-- ... -->);
        # an empty pattern here would make the substitution a no-op.
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(
            title=None,
            text_content=markdown_text,
        )
) + + def _convert_docintel( + self, local_path: str, extensions: List[Union[str, None]], **kwargs + ) -> DocumentConverterResult: + error_trace = "" + for ext in extensions + [None]: # Try last with no extension + _kwargs = copy.deepcopy(kwargs) + + # Overwrite file_extension appropriately + if ext is None: + if "file_extension" in _kwargs: + del _kwargs["file_extension"] + else: + _kwargs.update({"file_extension": ext}) + + # If we hit an error log it and keep trying + try: + res = self._docintel_converter.convert(local_path, **_kwargs) + except Exception: + error_trace = ("\n\n" + traceback.format_exc()).strip() + + if res is not None: + # Normalize the content + res.text_content = "\n".join( + [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] + ) + res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) + + # Todo + return res + + # If we got this far without success, report any exceptions + if len(error_trace) > 0: + raise FileConversionException( + f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}" + ) + + # Extension not supported by Document Intelligence + raise UnsupportedFormatException( + f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported by Document Intelligence." + ) def _append_ext(self, extensions, ext): """Append a unique non-None, non-empty extension to a list of extensions."""