added isolated doc_intel main conversion function
This commit is contained in:
parent
62a0d6c082
commit
811e4413aa
1 changed files with 98 additions and 51 deletions
|
|
@ -168,54 +168,6 @@ class DocumentConverter:
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
class DocumentIntelligenceConverter(DocumentConverter):
|
|
||||||
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
endpoint: str,
|
|
||||||
api_version: str = "2024-07-31-preview",
|
|
||||||
):
|
|
||||||
self.endpoint = endpoint
|
|
||||||
self.api_version = api_version
|
|
||||||
self.doc_intel_client = DocumentIntelligenceClient(
|
|
||||||
endpoint=self.endpoint, api_version=self.api_version, credential=DefaultAzureCredential()
|
|
||||||
)
|
|
||||||
|
|
||||||
def convert(
|
|
||||||
self, local_path: str, **kwargs: Any
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
extension = kwargs.get("file_extension", "")
|
|
||||||
|
|
||||||
# Get the bytestring for the local path
|
|
||||||
file_bytes = open(local_path, "rb").read()
|
|
||||||
|
|
||||||
# Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html)
|
|
||||||
if extension.lower() in [".xlsx", ".pptx", ".html"]:
|
|
||||||
analysis_features = []
|
|
||||||
else:
|
|
||||||
analysis_features = [
|
|
||||||
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
|
|
||||||
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
|
|
||||||
DocumentAnalysisFeature.STYLE_FONT # enable font style extraction
|
|
||||||
]
|
|
||||||
|
|
||||||
# Extract the text using Azure Document Intelligence
|
|
||||||
poller = self.doc_intel_client.begin_analyze_document(
|
|
||||||
model_id="prebuilt-layout",
|
|
||||||
analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
|
|
||||||
features=analysis_features,
|
|
||||||
output_content_format=ContentFormat.MARKDOWN,
|
|
||||||
)
|
|
||||||
result: AnalyzeResult = poller.result()
|
|
||||||
|
|
||||||
# remove comments from the markdown content generated by Doc Intelligence and append to markdown string
|
|
||||||
markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
|
|
||||||
return DocumentConverterResult(
|
|
||||||
title=None,
|
|
||||||
text_content=markdown_text,
|
|
||||||
)
|
|
||||||
|
|
||||||
class PlainTextConverter(DocumentConverter):
|
class PlainTextConverter(DocumentConverter):
|
||||||
"""Anything with content type text/plain"""
|
"""Anything with content type text/plain"""
|
||||||
|
|
||||||
|
|
@ -1374,6 +1326,57 @@ class ZipConverter(DocumentConverter):
|
||||||
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
|
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.

    The converter sends the raw bytes of a local file to the service using the
    "prebuilt-layout" model, asks for Markdown output, and returns that Markdown
    with HTML comments stripped.
    """

    # Extensions this converter attempts; for anything else convert() returns None.
    _SUPPORTED_EXTENSIONS = (
        ".pdf",
        ".docx",
        ".xlsx",
        ".pptx",
        ".html",
        ".jpeg",
        ".jpg",
        ".png",
        ".bmp",
        ".tiff",
        ".heif",
    )

    # Certain document analysis features are not available for these filetypes,
    # so no optional features are requested for them.
    _NO_FEATURE_EXTENSIONS = (".xlsx", ".pptx", ".html")

    def __init__(
        self,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        """Create the Document Intelligence client.

        Args:
            endpoint: Endpoint passed to ``DocumentIntelligenceClient``.
            api_version: API version passed to ``DocumentIntelligenceClient``.
        """
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=DefaultAzureCredential(),
        )

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the file at ``local_path`` to Markdown via Document Intelligence.

        Args:
            local_path: Path of the file whose bytes are sent for analysis.
            **kwargs: Only ``file_extension`` is read; it selects whether the
                file is handled and which analysis features are requested.

        Returns:
            A ``DocumentConverterResult`` with ``title=None`` and the cleaned
            Markdown as ``text_content``, or ``None`` when the extension is
            missing or not in the supported list.
        """
        # Bail if extension is not supported by Document Intelligence.
        # `or ""` also covers an explicit file_extension=None, which would
        # otherwise fail on .lower().
        extension = (kwargs.get("file_extension") or "").lower()
        if extension not in self._SUPPORTED_EXTENSIONS:
            return None

        # Get the bytestring for the local path; the context manager makes sure
        # the handle is closed even if the read fails.
        with open(local_path, "rb") as file_handle:
            file_bytes = file_handle.read()

        if extension in self._NO_FEATURE_EXTENSIONS:
            analysis_features = []
        else:
            analysis_features = [
                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
            ]

        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
            output_content_format=ContentFormat.MARKDOWN,
        )
        result: AnalyzeResult = poller.result()

        # Remove comments from the markdown content generated by Doc Intelligence
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(
            title=None,
            text_content=markdown_text,
        )
|
||||||
|
|
||||||
class FileConversionException(BaseException):
|
class FileConversionException(BaseException):
|
||||||
pass
|
pass
|
||||||
|
|
@ -1441,9 +1444,8 @@ class MarkItDown:
|
||||||
self._style_map = style_map
|
self._style_map = style_map
|
||||||
self._exiftool_path = exiftool_path
|
self._exiftool_path = exiftool_path
|
||||||
|
|
||||||
self._docintel_endpoint = docintel_endpoint
|
if docintel_endpoint is not None:
|
||||||
if self._docintel_endpoint is not None:
|
self._docintel_converter = DocumentIntelligenceConverter(endpoint=docintel_endpoint)
|
||||||
self._docintel_client = DocumentIntelligenceConverter(endpoint=self._docintel_endpoint)
|
|
||||||
|
|
||||||
self._page_converters: List[DocumentConverter] = []
|
self._page_converters: List[DocumentConverter] = []
|
||||||
|
|
||||||
|
|
@ -1510,6 +1512,8 @@ class MarkItDown:
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
|
if self._docintel_converter is not None:
|
||||||
|
return self._convert_docintel(path, extensions, **kwargs)
|
||||||
return self._convert(path, extensions, **kwargs)
|
return self._convert(path, extensions, **kwargs)
|
||||||
|
|
||||||
# TODO what should stream's type be?
|
# TODO what should stream's type be?
|
||||||
|
|
@ -1594,6 +1598,8 @@ class MarkItDown:
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
|
if self._docintel_converter is not None:
|
||||||
|
result = self._convert_docintel(temp_path, extensions, url=response.url, **kwargs)
|
||||||
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
|
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
|
||||||
# Clean up
|
# Clean up
|
||||||
finally:
|
finally:
|
||||||
|
|
@ -1663,6 +1669,47 @@ class MarkItDown:
|
||||||
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
|
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _convert_docintel(
    self, local_path: str, extensions: List[Union[str, None]], **kwargs
) -> DocumentConverterResult:
    """Convert ``local_path`` with the Document Intelligence converter.

    Each candidate extension is tried in order, followed by one final attempt
    with no ``file_extension`` at all. The first non-None result is normalized
    and returned.

    Args:
        local_path: Path of the file to convert.
        extensions: Candidate file extensions to try, in order.
        **kwargs: Forwarded to the converter; ``file_extension`` is overwritten
            (or removed) for each attempt.

    Returns:
        The converter result with trailing whitespace stripped from every line
        and runs of three or more newlines collapsed to two.

    Raises:
        FileConversionException: No attempt succeeded and at least one raised;
            the message carries the traceback of the most recent failure.
        UnsupportedFormatException: No attempt succeeded and none raised.
    """
    error_trace = ""
    for ext in extensions + [None]:  # Try last with no extension
        # Work on a copy so one attempt's file_extension never leaks into the
        # next attempt or back to the caller.
        _kwargs = copy.deepcopy(kwargs)

        # Overwrite file_extension appropriately
        if ext is None:
            _kwargs.pop("file_extension", None)
        else:
            _kwargs["file_extension"] = ext

        # Reset per attempt: if convert() raises, `res` must be None rather
        # than unbound (first iteration) or left over from an earlier attempt.
        res = None

        # If we hit an error log it and keep trying
        try:
            res = self._docintel_converter.convert(local_path, **_kwargs)
        except Exception:
            # Deliberately broad: any converter failure is recorded so the
            # remaining extensions still get a chance.
            error_trace = traceback.format_exc().strip()

        if res is not None:
            # Normalize the content
            res.text_content = "\n".join(
                line.rstrip() for line in re.split(r"\r?\n", res.text_content)
            )
            res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
            return res

    # If we got this far without success, report any exceptions
    if len(error_trace) > 0:
        raise FileConversionException(
            f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
        )

    # Extension not supported by Document Intelligence
    raise UnsupportedFormatException(
        f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported by Document Intelligence."
    )
|
||||||
|
|
||||||
def _append_ext(self, extensions, ext):
|
def _append_ext(self, extensions, ext):
|
||||||
"""Append a unique non-None, non-empty extension to a list of extensions."""
|
"""Append a unique non-None, non-empty extension to a list of extensions."""
|
||||||
if ext is None:
|
if ext is None:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue