diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 5bc9ede..69e8f0e 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -73,7 +73,9 @@ def main(): if args.use_docintel: if args.endpoint is None: - raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.") + raise ValueError( + "Document Intelligence Endpoint is required when using Document Intelligence." + ) elif args.filename is None: raise ValueError("Filename is required when using Document Intelligence.") markitdown = MarkItDown(docintel_endpoint=args.endpoint) @@ -87,6 +89,7 @@ def main(): _handle_output(args, result) + def _handle_output(args, result: DocumentConverterResult): """Handle output to stdout or file""" if args.output: diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index b14b244..ae6a7b4 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -41,6 +41,7 @@ from azure.ai.documentintelligence.models import ( DocumentAnalysisFeature, ) from azure.identity import DefaultAzureCredential + # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # This constant is a temporary fix until the bug is resolved. CONTENT_FORMAT = "markdown" @@ -170,6 +171,7 @@ class DocumentConverter: ) -> Union[None, DocumentConverterResult]: raise NotImplementedError() + class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" @@ -1327,7 +1329,8 @@ class ZipConverter(DocumentConverter): title=None, text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", ) - + + class DocumentIntelligenceConverter(DocumentConverter): """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" @@ -1339,7 +1342,9 @@ class DocumentIntelligenceConverter(DocumentConverter): self.endpoint = endpoint self.api_version = api_version self.doc_intel_client = DocumentIntelligenceClient( - endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential() + endpoint=self.endpoint, + api_version=self.api_version, + credential=DefaultAzureCredential(), ) def convert( @@ -1347,7 +1352,19 @@ class DocumentIntelligenceConverter(DocumentConverter): ) -> Union[None, DocumentConverterResult]: # Bail if extension is not supported by Document Intelligence extension = kwargs.get("file_extension", "") - docintel_extensions = [".pdf", ".docx", ".xlsx", ".pptx", ".html", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif"] + docintel_extensions = [ + ".pdf", + ".docx", + ".xlsx", + ".pptx", + ".html", + ".jpeg", + ".jpg", + ".png", + ".bmp", + ".tiff", + ".heif", + ] if extension.lower() not in docintel_extensions: return None @@ -1362,15 +1379,15 @@ class DocumentIntelligenceConverter(DocumentConverter): analysis_features = [ DocumentAnalysisFeature.FORMULAS, # enable formula extraction DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR - DocumentAnalysisFeature.STYLE_FONT # enable font style extraction + DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction ] - + # Extract the text using Azure Document Intelligence poller = self.doc_intel_client.begin_analyze_document( model_id="prebuilt-layout", body=AnalyzeDocumentRequest(bytes_source=file_bytes), features=analysis_features, - output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed + output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed ) result: AnalyzeResult = poller.result() @@ -1381,6 +1398,7 @@ class DocumentIntelligenceConverter(DocumentConverter): text_content=markdown_text, ) + class FileConversionException(BaseException): pass @@ -1472,7 +1490,9 @@ class MarkItDown: # Register Document Intelligence converter at the top of the stack if endpoint is provided if docintel_endpoint is not None: - self.register_page_converter(DocumentIntelligenceConverter(endpoint=docintel_endpoint)) + self.register_page_converter( + DocumentIntelligenceConverter(endpoint=docintel_endpoint) + ) def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any