Ran tests for docintel and offline conversion across many filetypes
This commit is contained in:
parent
b211ddbe82
commit
9230300100
1 changed file with 15 additions and 13 deletions
|
|
@ -1352,7 +1352,8 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Get the bytestring for the local path
|
# Get the bytestring for the local path
|
||||||
file_bytes = open(local_path, "rb").read()
|
with open(local_path, "rb") as f:
|
||||||
|
file_bytes = f.read()
|
||||||
|
|
||||||
# Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html)
|
# Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html)
|
||||||
if extension.lower() in [".xlsx", ".pptx", ".html"]:
|
if extension.lower() in [".xlsx", ".pptx", ".html"]:
|
||||||
|
|
@ -1367,7 +1368,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
# Extract the text using Azure Document Intelligence
|
# Extract the text using Azure Document Intelligence
|
||||||
poller = self.doc_intel_client.begin_analyze_document(
|
poller = self.doc_intel_client.begin_analyze_document(
|
||||||
model_id="prebuilt-layout",
|
model_id="prebuilt-layout",
|
||||||
analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
|
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
|
||||||
features=analysis_features,
|
features=analysis_features,
|
||||||
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
|
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
|
||||||
)
|
)
|
||||||
|
|
@ -1604,6 +1605,7 @@ class MarkItDown:
|
||||||
# Convert
|
# Convert
|
||||||
if self._docintel_converter is not None:
|
if self._docintel_converter is not None:
|
||||||
result = self._convert_docintel(temp_path, extensions, url=response.url, **kwargs)
|
result = self._convert_docintel(temp_path, extensions, url=response.url, **kwargs)
|
||||||
|
else:
|
||||||
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
|
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
|
||||||
# Clean up
|
# Clean up
|
||||||
finally:
|
finally:
|
||||||
|
|
@ -1690,8 +1692,6 @@ class MarkItDown:
|
||||||
# If we hit an error log it and keep trying
|
# If we hit an error log it and keep trying
|
||||||
try:
|
try:
|
||||||
res = self._docintel_converter.convert(local_path, **_kwargs)
|
res = self._docintel_converter.convert(local_path, **_kwargs)
|
||||||
except Exception:
|
|
||||||
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
|
||||||
|
|
||||||
if res is not None:
|
if res is not None:
|
||||||
# Normalize the content
|
# Normalize the content
|
||||||
|
|
@ -1702,6 +1702,8 @@ class MarkItDown:
|
||||||
|
|
||||||
# Todo
|
# Todo
|
||||||
return res
|
return res
|
||||||
|
except Exception:
|
||||||
|
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
||||||
|
|
||||||
# If we got this far without success, report any exceptions
|
# If we got this far without success, report any exceptions
|
||||||
if len(error_trace) > 0:
|
if len(error_trace) > 0:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue