From 43bd79adc98552ab994360940d207e32c8a59466 Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 28 Feb 2025 16:07:47 -0800 Subject: [PATCH] Print and log better exceptions when file conversions fail. (#1080) * Print and log better exceptions when file conversions fail. * Added unit tests for exceptions. --- .../markitdown/src/markitdown/__init__.py | 2 + .../markitdown/src/markitdown/_exceptions.py | 46 ++++++++++++++---- .../markitdown/src/markitdown/_markitdown.py | 20 +++++--- .../markitdown/tests/test_files/random.bin | Bin 0 -> 1024 bytes packages/markitdown/tests/test_markitdown.py | 18 ++++++- 5 files changed, 69 insertions(+), 17 deletions(-) create mode 100644 packages/markitdown/tests/test_files/random.bin diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py index 59d9750..c6d5363 100644 --- a/packages/markitdown/src/markitdown/__init__.py +++ b/packages/markitdown/src/markitdown/__init__.py @@ -7,6 +7,7 @@ from ._markitdown import MarkItDown from ._exceptions import ( MarkItDownException, ConverterPrerequisiteException, + FailedConversionAttempt, FileConversionException, UnsupportedFormatException, ) @@ -19,6 +20,7 @@ __all__ = [ "DocumentConverterResult", "MarkItDownException", "ConverterPrerequisiteException", + "FailedConversionAttempt", "FileConversionException", "UnsupportedFormatException", ] diff --git a/packages/markitdown/src/markitdown/_exceptions.py b/packages/markitdown/src/markitdown/_exceptions.py index 30c4dc5..50d2496 100644 --- a/packages/markitdown/src/markitdown/_exceptions.py +++ b/packages/markitdown/src/markitdown/_exceptions.py @@ -1,3 +1,6 @@ +from typing import Optional, List, Any + + class MarkItDownException(BaseException): """ Base exception class for MarkItDown. @@ -20,18 +23,43 @@ class ConverterPrerequisiteException(MarkItDownException): pass -class FileConversionException(MarkItDownException): - """ - Thrown when a suitable converter was found, but the conversion - process fails for any reason. - """ - - pass - - class UnsupportedFormatException(MarkItDownException): """ Thrown when no suitable converter was found for the given file. """ pass + + +class FailedConversionAttempt(object): + """ + Represents an a single attempt to convert a file. + """ + + def __init__(self, converter: Any, exc_info: Optional[tuple] = None): + self.converter = converter + self.exc_info = exc_info + + +class FileConversionException(MarkItDownException): + """ + Thrown when a suitable converter was found, but the conversion + process fails for any reason. + """ + + def __init__( + self, + message: Optional[str] = None, + attempts: Optional[List[FailedConversionAttempt]] = None, + ): + self.attempts = attempts + + if message is None: + if attempts is None: + message = "File conversion failed." + else: + message = f"File conversion failed after {len(attempts)} attempts:\n" + for attempt in attempts: + message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n" + + super().__init__(message) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 51f5c33..49b817d 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -2,6 +2,7 @@ import copy import mimetypes import os import re +import sys import tempfile import warnings import traceback @@ -42,6 +43,7 @@ from ._exceptions import ( FileConversionException, UnsupportedFormatException, ConverterPrerequisiteException, + FailedConversionAttempt, ) # Override mimetype for csv to fix issue on windows @@ -313,7 +315,9 @@ class MarkItDown: self, local_path: str, extensions: List[Union[str, None]], **kwargs ) -> DocumentConverterResult: res: Union[None, DocumentConverterResult] = None - error_trace = "" + + # Keep track of which converters throw exceptions + failed_attempts: List[FailedConversionAttempt] = [] # Create a copy of the page_converters list, sorted by priority. # We do this with each call to _convert because the priority of converters may change between calls. @@ -351,7 +355,11 @@ class MarkItDown: try: res = converter.convert(local_path, **_kwargs) except Exception: - error_trace = ("\n\n" + traceback.format_exc()).strip() + failed_attempts.append( + FailedConversionAttempt( + converter=converter, exc_info=sys.exc_info() + ) + ) if res is not None: # Normalize the content @@ -364,14 +372,12 @@ class MarkItDown: return res # If we got this far without success, report any exceptions - if len(error_trace) > 0: - raise FileConversionException( - f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}" - ) + if len(failed_attempts) > 0: + raise FileConversionException(attempts=failed_attempts) # Nothing can handle it! raise UnsupportedFormatException( - f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported." + f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported." ) def _append_ext(self, extensions, ext): diff --git a/packages/markitdown/tests/test_files/random.bin b/packages/markitdown/tests/test_files/random.bin new file mode 100644 index 0000000000000000000000000000000000000000..e673935e3fe247eb568b532edda79cfa6d7cfd23 GIT binary patch literal 1024 zcmV+b1poVZqDUF|I@MPm@I2ba$0^0xsbp5=o$th>YH+h(LnR-*n?R?s5~^bsp5l$X z7!7%l(k=#N0Jw)Mg|7LqVl#M>YgXS>F_7qT+Ta$BOcL@<=D?8&Tyzbza0M!}1$;|W zE280En|hWp5mqOnj+8dseB-OIb=`)1NzxJ`L~!o}I7|^;+VBtH0Eryk+!aOZQtBbu z+d)OC*Sj!C8hXD<05H0AteX&eRn;m*M-^4a)GnCF&Oy_Xtl2HYT$Q4Cl5VSs&Ep6m zBH}pJ@5V)56TBb@{sw}78^asVLNQvD&&bza?s=)<`Aum^Nwp9aYfaNz@n1(ZtEb3T zn1_qrM48Xdk_Sp=F9@=dlt0+n(CwH7(8{%|5AU1C`0=y}c8wJ(VA?k|Fxn_-03S<% z$$npfgt!B__+tzm3hvzTwmBh^;BS#Xmy#m!uK{z{@z~TV;a{SJXCFB0G0>@izrxvx zLDXFsvXHb$p}6ONcFotcO_yhBU>g^5`itQOtrNzb33~pIm^}+Nu)R^|`okXdo2=yR zql__p$hJ^g1zP)QpcN+YcA2?NB5#bK&KyY*C&eZHEFLGUm&Zeypx-Y>RQV_7eJE?j{;Fovw#+-GF2oxJtV}EyPNi0fN&@c zy$8z^dxzdPjCj{s>*ZM6XASd9ew9-p$NFR)ABLs}(MEP|*<4wZP9tC-Dmu2BTG{ez zM9(1*o!NO|rTs52NkHUb8t^@X;Gb=?g5W+B@PHq)d9}wi50J=^)9pDfUine~u}mW@EAyzV60qXX@FPW%u??=}_W#YoyH!tqHeq^X1rTURVlV uZ!;^~(Qhc=?1Mm6xcY{dXP^%X3G4bAv$;g8TUZ literal 0 HcmV?d00001 diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py index 55afcc3..0a3b56e 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -8,7 +8,7 @@ import requests from warnings import catch_warnings, resetwarnings -from markitdown import MarkItDown +from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException skip_remote = ( True if os.environ.get("GITHUB_ACTIONS") else False @@ -272,6 +272,21 @@ def test_markitdown_local() -> None: assert "# Test" in result.text_content +def test_exceptions() -> None: + # Check that an exception is raised when trying to convert an unsupported format + markitdown = MarkItDown() + with pytest.raises(UnsupportedFormatException): + markitdown.convert(os.path.join(TEST_FILES_DIR, "random.bin")) + + # Check that an exception is raised when trying to convert a file that is corrupted + with pytest.raises(FileConversionException) as exc_info: + markitdown.convert( + os.path.join(TEST_FILES_DIR, "random.bin"), file_extension=".pptx" + ) + assert len(exc_info.value.attempts) == 1 + assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter" + + @pytest.mark.skipif( skip_exiftool, reason="do not run if exiftool is not installed", @@ -329,6 +344,7 @@ if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() test_markitdown_local() + test_exceptions() test_markitdown_exiftool() # test_markitdown_llm() print("All tests passed!")