diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index d83424d..6370fa7 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -13,7 +13,7 @@ import sys import tempfile import traceback import zipfile -from typing import Any, Dict, List, Optional, Union, overload +from typing import Any, Dict, List, Optional, Union from pathlib import Path from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from warnings import catch_warnings @@ -1047,7 +1047,7 @@ class MarkItDown: ) -> DocumentConverterResult: # TODO: deal with kwargs """ Args: - - source: can be a string representing a path or url, or a requests.response object + - source: can be a path (given either as a string or as a pathlib path object), a url string, or a requests.response object - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) """ diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 3cc56e1..35ad35d 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -230,105 +230,8 @@ def test_markitdown_exiftool() -> None: assert target in result.text_content - -def test_markitdown_local_pathlib() -> None: - markitdown = MarkItDown() - - # Test XLSX processing - result = markitdown.convert(Path(TEST_FILES_DIR) / "test.xlsx") - for test_string in XLSX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test DOCX processing - result = markitdown.convert(Path(TEST_FILES_DIR) / "test.docx") - for test_string in DOCX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test DOCX processing, with comments - result = markitdown.convert( - Path(TEST_FILES_DIR) / "test_with_comment.docx", - style_map="comment-reference => ", - ) - for test_string in DOCX_COMMENT_TEST_STRINGS: - text_content = 
result.text_content.replace("\\", "") - assert test_string in text_content - - # Test DOCX processing, with comments and setting style_map on init - markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") - result = markitdown_with_style_map.convert( - Path(TEST_FILES_DIR) / "test_with_comment.docx" - ) - for test_string in DOCX_COMMENT_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test PPTX processing - result = markitdown.convert(Path(TEST_FILES_DIR) / "test.pptx") - for test_string in PPTX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test HTML processing - result = markitdown.convert( - Path(TEST_FILES_DIR) / "test_blog.html", url=BLOG_TEST_URL - ) - for test_string in BLOG_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test ZIP file processing - result = markitdown.convert(Path(TEST_FILES_DIR) / "test_files.zip") - for test_string in DOCX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test Wikipedia processing - result = markitdown.convert( - Path(TEST_FILES_DIR) / "test_wikipedia.html", url=WIKIPEDIA_TEST_URL - ) - text_content = result.text_content.replace("\\", "") - for test_string in WIKIPEDIA_TEST_EXCLUDES: - assert test_string not in text_content - for test_string in WIKIPEDIA_TEST_STRINGS: - assert test_string in text_content - - # Test Bing processing - result = markitdown.convert( - Path(TEST_FILES_DIR) / "test_serp.html", url=SERP_TEST_URL - ) - text_content = result.text_content.replace("\\", "") - for test_string in SERP_TEST_EXCLUDES: - assert test_string not in text_content - for test_string in SERP_TEST_STRINGS: - assert test_string in text_content - - # Test non-UTF-8 encoding - result = markitdown.convert(Path(TEST_FILES_DIR) / "test_mskanji.csv") - text_content = 
result.text_content.replace("\\", "") - for test_string in CSV_CP932_TEST_STRINGS: - assert test_string in text_content - - -@pytest.mark.skipif( - skip_exiftool, - reason="do not run if exiftool is not installed", -) -def test_markitdown_exiftool_pathlib() -> None: - markitdown = MarkItDown() - - # Test JPG metadata processing - result = markitdown.convert(Path(TEST_FILES_DIR) / "test.jpg") - for key in JPG_TEST_EXIFTOOL: - target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" - assert target in result.text_content - - if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() test_markitdown_local() test_markitdown_exiftool() - test_markitdown_local_pathlib() - test_markitdown_exiftool_pathlib() \ No newline at end of file