formatting changes

This commit is contained in:
Kenny Zhang 2025-01-23 17:47:30 -05:00
parent 1310bd48ad
commit 46c4890bb4
2 changed files with 31 additions and 8 deletions

View file

@@ -73,7 +73,9 @@ def main():
if args.use_docintel: if args.use_docintel:
if args.endpoint is None: if args.endpoint is None:
raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.") raise ValueError(
"Document Intelligence Endpoint is required when using Document Intelligence."
)
elif args.filename is None: elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.") raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(docintel_endpoint=args.endpoint) markitdown = MarkItDown(docintel_endpoint=args.endpoint)
@@ -87,6 +89,7 @@ def main():
_handle_output(args, result) _handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult): def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file""" """Handle output to stdout or file"""
if args.output: if args.output:

View file

@@ -41,6 +41,7 @@ from azure.ai.documentintelligence.models import (
DocumentAnalysisFeature, DocumentAnalysisFeature,
) )
from azure.identity import DefaultAzureCredential from azure.identity import DefaultAzureCredential
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved. # This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown" CONTENT_FORMAT = "markdown"
@@ -170,6 +171,7 @@ class DocumentConverter:
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
raise NotImplementedError() raise NotImplementedError()
class PlainTextConverter(DocumentConverter): class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain""" """Anything with content type text/plain"""
@@ -1328,6 +1330,7 @@ class ZipConverter(DocumentConverter):
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
) )
class DocumentIntelligenceConverter(DocumentConverter): class DocumentIntelligenceConverter(DocumentConverter):
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
@@ -1339,7 +1342,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
self.endpoint = endpoint self.endpoint = endpoint
self.api_version = api_version self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient( self.doc_intel_client = DocumentIntelligenceClient(
endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential() endpoint=self.endpoint,
api_version=self.api_version,
credential=DefaultAzureCredential(),
) )
def convert( def convert(
@@ -1347,7 +1352,19 @@ class DocumentIntelligenceConverter(DocumentConverter):
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if extension is not supported by Document Intelligence # Bail if extension is not supported by Document Intelligence
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
docintel_extensions = [".pdf", ".docx", ".xlsx", ".pptx", ".html", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif"] docintel_extensions = [
".pdf",
".docx",
".xlsx",
".pptx",
".html",
".jpeg",
".jpg",
".png",
".bmp",
".tiff",
".heif",
]
if extension.lower() not in docintel_extensions: if extension.lower() not in docintel_extensions:
return None return None
@@ -1362,7 +1379,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
analysis_features = [ analysis_features = [
DocumentAnalysisFeature.FORMULAS, # enable formula extraction DocumentAnalysisFeature.FORMULAS, # enable formula extraction
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
DocumentAnalysisFeature.STYLE_FONT # enable font style extraction DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
] ]
# Extract the text using Azure Document Intelligence # Extract the text using Azure Document Intelligence
@@ -1381,6 +1398,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
text_content=markdown_text, text_content=markdown_text,
) )
class FileConversionException(BaseException): class FileConversionException(BaseException):
pass pass
@@ -1472,7 +1490,9 @@ class MarkItDown:
# Register Document Intelligence converter at the top of the stack if endpoint is provided # Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None: if docintel_endpoint is not None:
self.register_page_converter(DocumentIntelligenceConverter(endpoint=docintel_endpoint)) self.register_page_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
)
def convert( def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any self, source: Union[str, requests.Response, Path], **kwargs: Any