formatting changes
This commit is contained in:
parent
1310bd48ad
commit
46c4890bb4
2 changed files with 31 additions and 8 deletions
|
|
@@ -73,7 +73,9 @@ def main():
|
|||
|
||||
if args.use_docintel:
|
||||
if args.endpoint is None:
|
||||
raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.")
|
||||
raise ValueError(
|
||||
"Document Intelligence Endpoint is required when using Document Intelligence."
|
||||
)
|
||||
elif args.filename is None:
|
||||
raise ValueError("Filename is required when using Document Intelligence.")
|
||||
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
|
||||
|
|
@@ -87,6 +89,7 @@ def main():
|
|||
|
||||
_handle_output(args, result)
|
||||
|
||||
|
||||
def _handle_output(args, result: DocumentConverterResult):
|
||||
"""Handle output to stdout or file"""
|
||||
if args.output:
|
||||
|
|
|
|||
|
|
@@ -41,6 +41,7 @@ from azure.ai.documentintelligence.models import (
|
|||
DocumentAnalysisFeature,
|
||||
)
|
||||
from azure.identity import DefaultAzureCredential
|
||||
|
||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||
# This constant is a temporary fix until the bug is resolved.
|
||||
CONTENT_FORMAT = "markdown"
|
||||
|
|
@@ -170,6 +171,7 @@ class DocumentConverter:
|
|||
) -> Union[None, DocumentConverterResult]:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class PlainTextConverter(DocumentConverter):
|
||||
"""Anything with content type text/plain"""
|
||||
|
||||
|
|
@@ -1327,7 +1329,8 @@ class ZipConverter(DocumentConverter):
|
|||
title=None,
|
||||
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
|
||||
)
|
||||
|
||||
|
||||
|
||||
class DocumentIntelligenceConverter(DocumentConverter):
|
||||
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
|
||||
|
||||
|
|
@@ -1339,7 +1342,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||
self.endpoint = endpoint
|
||||
self.api_version = api_version
|
||||
self.doc_intel_client = DocumentIntelligenceClient(
|
||||
endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential()
|
||||
endpoint=self.endpoint,
|
||||
api_version=self.api_version,
|
||||
credential=DefaultAzureCredential(),
|
||||
)
|
||||
|
||||
def convert(
|
||||
|
|
@@ -1347,7 +1352,19 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if extension is not supported by Document Intelligence
|
||||
extension = kwargs.get("file_extension", "")
|
||||
docintel_extensions = [".pdf", ".docx", ".xlsx", ".pptx", ".html", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif"]
|
||||
docintel_extensions = [
|
||||
".pdf",
|
||||
".docx",
|
||||
".xlsx",
|
||||
".pptx",
|
||||
".html",
|
||||
".jpeg",
|
||||
".jpg",
|
||||
".png",
|
||||
".bmp",
|
||||
".tiff",
|
||||
".heif",
|
||||
]
|
||||
if extension.lower() not in docintel_extensions:
|
||||
return None
|
||||
|
||||
|
|
@@ -1362,15 +1379,15 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||
analysis_features = [
|
||||
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
|
||||
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
|
||||
DocumentAnalysisFeature.STYLE_FONT # enable font style extraction
|
||||
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
|
||||
]
|
||||
|
||||
|
||||
# Extract the text using Azure Document Intelligence
|
||||
poller = self.doc_intel_client.begin_analyze_document(
|
||||
model_id="prebuilt-layout",
|
||||
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
|
||||
features=analysis_features,
|
||||
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
|
||||
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
|
||||
)
|
||||
result: AnalyzeResult = poller.result()
|
||||
|
||||
|
|
@@ -1381,6 +1398,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||
text_content=markdown_text,
|
||||
)
|
||||
|
||||
|
||||
class FileConversionException(BaseException):
|
||||
pass
|
||||
|
||||
|
|
@@ -1472,7 +1490,9 @@ class MarkItDown:
|
|||
|
||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||
if docintel_endpoint is not None:
|
||||
self.register_page_converter(DocumentIntelligenceConverter(endpoint=docintel_endpoint))
|
||||
self.register_page_converter(
|
||||
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
|
||||
)
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||
|
|
|
|||
Loading…
Reference in a new issue