formatting changes

2025-01-23 17:47:30 -05:00 · 2025-01-23 17:47:30 -05:00 · 46c4890bb4
commit 46c4890bb4
parent 1310bd48ad
2 changed files with 31 additions and 8 deletions
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@@ -73,7 +73,9 @@ def main():
    if args.use_docintel:
        if args.endpoint is None:
-            raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.")
+            raise ValueError(
+                "Document Intelligence Endpoint is required when using Document Intelligence."
+            )
        elif args.filename is None:
            raise ValueError("Filename is required when using Document Intelligence.")
        markitdown = MarkItDown(docintel_endpoint=args.endpoint)
@@ -87,6 +89,7 @@ def main():
    _handle_output(args, result)
 def _handle_output(args, result: DocumentConverterResult):
    """Handle output to stdout or file"""
    if args.output:
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -41,6 +41,7 @@ from azure.ai.documentintelligence.models import (
    DocumentAnalysisFeature,
 )
 from azure.identity import DefaultAzureCredential
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
 # This constant is a temporary fix until the bug is resolved.
 CONTENT_FORMAT = "markdown"
@@ -170,6 +171,7 @@ class DocumentConverter:
    ) -> Union[None, DocumentConverterResult]:
        raise NotImplementedError()
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
@@ -1327,7 +1329,8 @@ class ZipConverter(DocumentConverter):
                title=None,
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
-        
+
 class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
@@ -1339,7 +1342,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
-            endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential()
+            endpoint=self.endpoint,
+            api_version=self.api_version,
+            credential=DefaultAzureCredential(),
        )
    def convert(
@@ -1347,7 +1352,19 @@ class DocumentIntelligenceConverter(DocumentConverter):
    ) -> Union[None, DocumentConverterResult]:
        # Bail if extension is not supported by Document Intelligence
        extension = kwargs.get("file_extension", "")
-        docintel_extensions = [".pdf", ".docx", ".xlsx", ".pptx", ".html", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif"]
+        docintel_extensions = [
+            ".pdf",
+            ".docx",
+            ".xlsx",
+            ".pptx",
+            ".html",
+            ".jpeg",
+            ".jpg",
+            ".png",
+            ".bmp",
+            ".tiff",
+            ".heif",
+        ]
        if extension.lower() not in docintel_extensions:
            return None
@@ -1362,15 +1379,15 @@ class DocumentIntelligenceConverter(DocumentConverter):
            analysis_features = [
                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
-                DocumentAnalysisFeature.STYLE_FONT  # enable font style extraction
+                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
            ]
-        
+
        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
-            output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
+            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        )
        result: AnalyzeResult = poller.result()
@@ -1381,6 +1398,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
            text_content=markdown_text,
        )
 class FileConversionException(BaseException):
    pass
@@ -1472,7 +1490,9 @@ class MarkItDown:
        # Register Document Intelligence converter at the top of the stack if endpoint is provided
        if docintel_endpoint is not None:
-            self.register_page_converter(DocumentIntelligenceConverter(endpoint=docintel_endpoint))
+            self.register_page_converter(
+                DocumentIntelligenceConverter(endpoint=docintel_endpoint)
+            )
    def convert(
        self, source: Union[str, requests.Response, Path], **kwargs: Any