formatting changes

This commit is contained in:
Kenny Zhang 2025-01-23 17:47:30 -05:00
parent 1310bd48ad
commit 46c4890bb4
2 changed files with 31 additions and 8 deletions

View file

@ -73,7 +73,9 @@ def main():
if args.use_docintel:
if args.endpoint is None:
raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.")
raise ValueError(
"Document Intelligence Endpoint is required when using Document Intelligence."
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
@ -87,6 +89,7 @@ def main():
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if args.output:

View file

@ -41,6 +41,7 @@ from azure.ai.documentintelligence.models import (
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
@ -170,6 +171,7 @@ class DocumentConverter:
) -> Union[None, DocumentConverterResult]:
raise NotImplementedError()
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
@ -1328,6 +1330,7 @@ class ZipConverter(DocumentConverter):
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
)
class DocumentIntelligenceConverter(DocumentConverter):
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
@ -1339,7 +1342,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(
endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential()
endpoint=self.endpoint,
api_version=self.api_version,
credential=DefaultAzureCredential(),
)
def convert(
@ -1347,7 +1352,19 @@ class DocumentIntelligenceConverter(DocumentConverter):
) -> Union[None, DocumentConverterResult]:
# Bail if extension is not supported by Document Intelligence
extension = kwargs.get("file_extension", "")
docintel_extensions = [".pdf", ".docx", ".xlsx", ".pptx", ".html", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif"]
docintel_extensions = [
".pdf",
".docx",
".xlsx",
".pptx",
".html",
".jpeg",
".jpg",
".png",
".bmp",
".tiff",
".heif",
]
if extension.lower() not in docintel_extensions:
return None
@ -1362,7 +1379,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
analysis_features = [
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
DocumentAnalysisFeature.STYLE_FONT # enable font style extraction
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
]
# Extract the text using Azure Document Intelligence
@ -1370,7 +1387,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
model_id="prebuilt-layout",
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
features=analysis_features,
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
)
result: AnalyzeResult = poller.result()
@ -1381,6 +1398,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
text_content=markdown_text,
)
class FileConversionException(Exception):
    """Exception type named for file-conversion failures.

    The raise sites are outside this view, so the exact conditions under
    which it is raised are not documented here.
    """

    # Derives from Exception rather than BaseException: BaseException is
    # meant for interpreter-level signals such as KeyboardInterrupt and
    # SystemExit, and subclasses of it bypass ordinary `except Exception`
    # handlers. Handlers written as `except FileConversionException` or
    # `except BaseException` keep working unchanged.
@ -1472,7 +1490,9 @@ class MarkItDown:
# Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None:
self.register_page_converter(DocumentIntelligenceConverter(endpoint=docintel_endpoint))
self.register_page_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
)
def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any