From 9230300100d8e86fe2bc11d0dfce339d1d787a48 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Fri, 10 Jan 2025 14:11:48 -0500 Subject: [PATCH] ran tests for docintel and offline for many filetypes --- src/markitdown/_markitdown.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 347dd64..5dcf9b8 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1352,7 +1352,8 @@ class DocumentIntelligenceConverter(DocumentConverter): return None # Get the bytestring for the local path - file_bytes = open(local_path, "rb").read() + with open(local_path, "rb") as f: + file_bytes = f.read() # Certain document analysis features are not availiable for filetypes (.xlsx, .pptx, .html) if extension.lower() in [".xlsx", ".pptx", ".html"]: @@ -1367,7 +1368,7 @@ class DocumentIntelligenceConverter(DocumentConverter): # Extract the text using Azure Document Intelligence poller = self.doc_intel_client.begin_analyze_document( model_id="prebuilt-layout", - analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes), + body=AnalyzeDocumentRequest(bytes_source=file_bytes), features=analysis_features, output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed ) @@ -1604,7 +1605,8 @@ class MarkItDown: # Convert if self._docintel_converter is not None: result = self._convert_docintel(temp_path, extensions, url=response.url, **kwargs) - result = self._convert(temp_path, extensions, url=response.url, **kwargs) + else: + result = self._convert(temp_path, extensions, url=response.url, **kwargs) # Clean up finally: try: @@ -1690,18 +1692,18 @@ class MarkItDown: # If we hit an error log it and keep trying try: res = self._docintel_converter.convert(local_path, **_kwargs) + + if res is not None: + # Normalize the content + res.text_content = "\n".join( + [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] + ) + res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) + + # Todo + return res except Exception: error_trace = ("\n\n" + traceback.format_exc()).strip() - - if res is not None: - # Normalize the content - res.text_content = "\n".join( - [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] - ) - res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) - - # Todo - return res # If we got this far without success, report any exceptions if len(error_trace) > 0: