ran tests for docintel and offline for many filetypes

This commit is contained in:
Kenny Zhang 2025-01-10 14:11:48 -05:00
parent b211ddbe82
commit 9230300100

View file

@@ -1352,7 +1352,8 @@ class DocumentIntelligenceConverter(DocumentConverter):
return None return None
# Get the bytestring for the local path # Get the bytestring for the local path
file_bytes = open(local_path, "rb").read() with open(local_path, "rb") as f:
file_bytes = f.read()
# Certain document analysis features are not availiable for filetypes (.xlsx, .pptx, .html) # Certain document analysis features are not availiable for filetypes (.xlsx, .pptx, .html)
if extension.lower() in [".xlsx", ".pptx", ".html"]: if extension.lower() in [".xlsx", ".pptx", ".html"]:
@@ -1367,7 +1368,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
# Extract the text using Azure Document Intelligence # Extract the text using Azure Document Intelligence
poller = self.doc_intel_client.begin_analyze_document( poller = self.doc_intel_client.begin_analyze_document(
model_id="prebuilt-layout", model_id="prebuilt-layout",
analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes), body=AnalyzeDocumentRequest(bytes_source=file_bytes),
features=analysis_features, features=analysis_features,
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
) )
@@ -1604,6 +1605,7 @@ class MarkItDown:
# Convert # Convert
if self._docintel_converter is not None: if self._docintel_converter is not None:
result = self._convert_docintel(temp_path, extensions, url=response.url, **kwargs) result = self._convert_docintel(temp_path, extensions, url=response.url, **kwargs)
else:
result = self._convert(temp_path, extensions, url=response.url, **kwargs) result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up # Clean up
finally: finally:
@@ -1690,8 +1692,6 @@ class MarkItDown:
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
try: try:
res = self._docintel_converter.convert(local_path, **_kwargs) res = self._docintel_converter.convert(local_path, **_kwargs)
except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip()
if res is not None: if res is not None:
# Normalize the content # Normalize the content
@@ -1702,6 +1702,8 @@ class MarkItDown:
# Todo # Todo
return res return res
except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip()
# If we got this far without success, report any exceptions # If we got this far without success, report any exceptions
if len(error_trace) > 0: if len(error_trace) > 0: