Ran docintel and offline tests for many file types

2025-01-10 14:11:48 -05:00 · 2025-01-10 14:11:48 -05:00 · 9230300100
commit 9230300100
parent b211ddbe82
1 changed files with 15 additions and 13 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -1352,7 +1352,8 @@ class DocumentIntelligenceConverter(DocumentConverter):
            return None
        # Get the bytestring for the local path
-        file_bytes = open(local_path, "rb").read()
+        with open(local_path, "rb") as f:
            file_bytes = f.read()
        # Certain document analysis features are not available for these filetypes (.xlsx, .pptx, .html)
        if extension.lower() in [".xlsx", ".pptx", ".html"]:
@ -1367,7 +1368,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
-            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
+            body=AnalyzeDocumentRequest(bytes_source=file_bytes),
            features=analysis_features,
            output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        )
@ -1604,6 +1605,7 @@ class MarkItDown:
            # Convert
            if self._docintel_converter is not None:
                result = self._convert_docintel(temp_path, extensions, url=response.url, **kwargs)
            else:
                result = self._convert(temp_path, extensions, url=response.url, **kwargs)
        # Clean up
        finally:
@ -1690,8 +1692,6 @@ class MarkItDown:
             # If we hit an error, log it and keep trying
            try:
                res = self._docintel_converter.convert(local_path, **_kwargs)
            except Exception:
                error_trace = ("\n\n" + traceback.format_exc()).strip()
                if res is not None:
                    # Normalize the content
@ -1702,6 +1702,8 @@ class MarkItDown:
                    # Todo
                    return res
            except Exception:
                error_trace = ("\n\n" + traceback.format_exc()).strip()
        # If we got this far without success, report any exceptions
        if len(error_trace) > 0: