This commit is contained in:
Alex 2024-12-15 23:57:00 +00:00 committed by GitHub
commit 5cbd3ceb6e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 18 additions and 2 deletions

View file

@@ -2,10 +2,16 @@
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException from ._markitdown import (
MarkItDown,
FileConversionException,
UnsupportedFormatException,
FileDoesNotExistException,
)
__all__ = [ __all__ = [
"MarkItDown", "MarkItDown",
"FileConversionException", "FileConversionException",
"FileDoesNotExistException",
"UnsupportedFormatException", "UnsupportedFormatException",
] ]

View file

@@ -845,6 +845,10 @@ class UnsupportedFormatException(BaseException):
pass pass
# Raised when a local path handed to the converter fails the
# os.path.exists() check, i.e. there is no such file to convert.
# NOTE(review): this derives from BaseException (as UnsupportedFormatException
# does), so a plain `except Exception:` handler will NOT catch it -- confirm
# that is intended.
class FileDoesNotExistException(BaseException):
pass
class MarkItDown: class MarkItDown:
"""(In preview) An extremely simple text-based document reader, suitable for LLM use. """(In preview) An extremely simple text-based document reader, suitable for LLM use.
This reader will convert common file-types or webpages to Markdown.""" This reader will convert common file-types or webpages to Markdown."""
@@ -911,6 +915,9 @@ class MarkItDown:
ext = kwargs.get("file_extension") ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else [] extensions = [ext] if ext is not None else []
if not os.path.exists(path):
raise FileDoesNotExistException(f"File {path} does not exist")
# Get extension alternatives from the path and puremagic # Get extension alternatives from the path and puremagic
base, ext = os.path.splitext(path) base, ext = os.path.splitext(path)
self._append_ext(extensions, ext) self._append_ext(extensions, ext)

View file

@@ -6,7 +6,7 @@ import shutil
import pytest import pytest
import requests import requests
from markitdown import MarkItDown from markitdown import MarkItDown, FileDoesNotExistException
skip_remote = ( skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False True if os.environ.get("GITHUB_ACTIONS") else False
@@ -144,6 +144,9 @@ def test_markitdown_local() -> None:
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
assert test_string in text_content assert test_string in text_content
with pytest.raises(FileDoesNotExistException):
markitdown.convert(os.path.join(TEST_FILES_DIR, "missing_file.pdf"))
# Test Wikipedia processing # Test Wikipedia processing
result = markitdown.convert( result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL