formatting changes
This commit is contained in:
parent
1310bd48ad
commit
46c4890bb4
2 changed files with 31 additions and 8 deletions
|
|
@@ -73,7 +73,9 @@ def main():
|
||||||
|
|
||||||
if args.use_docintel:
|
if args.use_docintel:
|
||||||
if args.endpoint is None:
|
if args.endpoint is None:
|
||||||
raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.")
|
raise ValueError(
|
||||||
|
"Document Intelligence Endpoint is required when using Document Intelligence."
|
||||||
|
)
|
||||||
elif args.filename is None:
|
elif args.filename is None:
|
||||||
raise ValueError("Filename is required when using Document Intelligence.")
|
raise ValueError("Filename is required when using Document Intelligence.")
|
||||||
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
|
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
|
||||||
|
|
@@ -87,6 +89,7 @@ def main():
|
||||||
|
|
||||||
_handle_output(args, result)
|
_handle_output(args, result)
|
||||||
|
|
||||||
|
|
||||||
def _handle_output(args, result: DocumentConverterResult):
|
def _handle_output(args, result: DocumentConverterResult):
|
||||||
"""Handle output to stdout or file"""
|
"""Handle output to stdout or file"""
|
||||||
if args.output:
|
if args.output:
|
||||||
|
|
|
||||||
|
|
@@ -41,6 +41,7 @@ from azure.ai.documentintelligence.models import (
|
||||||
DocumentAnalysisFeature,
|
DocumentAnalysisFeature,
|
||||||
)
|
)
|
||||||
from azure.identity import DefaultAzureCredential
|
from azure.identity import DefaultAzureCredential
|
||||||
|
|
||||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||||
# This constant is a temporary fix until the bug is resolved.
|
# This constant is a temporary fix until the bug is resolved.
|
||||||
CONTENT_FORMAT = "markdown"
|
CONTENT_FORMAT = "markdown"
|
||||||
|
|
@@ -170,6 +171,7 @@ class DocumentConverter:
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
class PlainTextConverter(DocumentConverter):
|
class PlainTextConverter(DocumentConverter):
|
||||||
"""Anything with content type text/plain"""
|
"""Anything with content type text/plain"""
|
||||||
|
|
||||||
|
|
@@ -1327,7 +1329,8 @@ class ZipConverter(DocumentConverter):
|
||||||
title=None,
|
title=None,
|
||||||
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
|
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class DocumentIntelligenceConverter(DocumentConverter):
|
class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
|
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
|
||||||
|
|
||||||
|
|
@@ -1339,7 +1342,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
self.endpoint = endpoint
|
self.endpoint = endpoint
|
||||||
self.api_version = api_version
|
self.api_version = api_version
|
||||||
self.doc_intel_client = DocumentIntelligenceClient(
|
self.doc_intel_client = DocumentIntelligenceClient(
|
||||||
endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential()
|
endpoint=self.endpoint,
|
||||||
|
api_version=self.api_version,
|
||||||
|
credential=DefaultAzureCredential(),
|
||||||
)
|
)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
|
|
@@ -1347,7 +1352,19 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if extension is not supported by Document Intelligence
|
# Bail if extension is not supported by Document Intelligence
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
docintel_extensions = [".pdf", ".docx", ".xlsx", ".pptx", ".html", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif"]
|
docintel_extensions = [
|
||||||
|
".pdf",
|
||||||
|
".docx",
|
||||||
|
".xlsx",
|
||||||
|
".pptx",
|
||||||
|
".html",
|
||||||
|
".jpeg",
|
||||||
|
".jpg",
|
||||||
|
".png",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".heif",
|
||||||
|
]
|
||||||
if extension.lower() not in docintel_extensions:
|
if extension.lower() not in docintel_extensions:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@@ -1362,15 +1379,15 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
analysis_features = [
|
analysis_features = [
|
||||||
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
|
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
|
||||||
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
|
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
|
||||||
DocumentAnalysisFeature.STYLE_FONT # enable font style extraction
|
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
|
||||||
]
|
]
|
||||||
|
|
||||||
# Extract the text using Azure Document Intelligence
|
# Extract the text using Azure Document Intelligence
|
||||||
poller = self.doc_intel_client.begin_analyze_document(
|
poller = self.doc_intel_client.begin_analyze_document(
|
||||||
model_id="prebuilt-layout",
|
model_id="prebuilt-layout",
|
||||||
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
|
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
|
||||||
features=analysis_features,
|
features=analysis_features,
|
||||||
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
|
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
|
||||||
)
|
)
|
||||||
result: AnalyzeResult = poller.result()
|
result: AnalyzeResult = poller.result()
|
||||||
|
|
||||||
|
|
@@ -1381,6 +1398,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
text_content=markdown_text,
|
text_content=markdown_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class FileConversionException(BaseException):
|
class FileConversionException(BaseException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
@@ -1472,7 +1490,9 @@ class MarkItDown:
|
||||||
|
|
||||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||||
if docintel_endpoint is not None:
|
if docintel_endpoint is not None:
|
||||||
self.register_page_converter(DocumentIntelligenceConverter(endpoint=docintel_endpoint))
|
self.register_page_converter(
|
||||||
|
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
|
||||||
|
)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue