From 30924f7bb9d84328b794ce00f26ab99c9d13e0ce Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 11 Mar 2025 22:02:47 -0700 Subject: [PATCH] Fixed CI errors, and included misc tests. --- packages/markitdown/tests/_test_vectors.py | 48 +-- packages/markitdown/tests/test_cli_vectors.py | 8 +- packages/markitdown/tests/test_module_misc.py | 324 ++++++++++++++++++ .../markitdown/tests/test_module_vectors.py | 17 +- 4 files changed, 345 insertions(+), 52 deletions(-) create mode 100644 packages/markitdown/tests/test_module_misc.py diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index eeb83f1..5d2b2fc 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -3,7 +3,7 @@ from typing import List @dataclasses.dataclass(frozen=True, kw_only=True) -class TestVector(object): +class FileTestVector(object): filename: str mimetype: str | None charset: str | None @@ -13,7 +13,7 @@ class TestVector(object): GENERAL_TEST_VECTORS = [ - TestVector( + FileTestVector( filename="test.docx", mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", charset=None, @@ -28,7 +28,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - TestVector( + FileTestVector( filename="test.xlsx", mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", charset=None, @@ -40,7 +40,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - TestVector( + FileTestVector( filename="test.xls", mimetype="application/vnd.ms-excel", charset=None, @@ -52,7 +52,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - TestVector( + FileTestVector( filename="test.pptx", mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", charset=None, @@ -68,7 +68,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - TestVector( + FileTestVector( filename="test_outlook_msg.msg", mimetype="application/vnd.ms-outlook", charset=None, @@ -83,7 
+83,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - TestVector( + FileTestVector( filename="test.pdf", mimetype="application/pdf", charset=None, @@ -93,23 +93,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - # TestVector( - # filename='test_with_comment.docx', - # mimetype='application/vnd.openxmlformats-officedocument.wordprocessingml.document', - # charset=None, - # must_include=[ - # "314b0a30-5b04-470b-b9f7-eed2c2bec74a", - # "49e168b7-d2ae-407f-a055-2167576f39a1", - # "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", - # "# Abstract", - # "# Introduction", - # "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - # "This is a test comment. 12df-321a", - # "Yet another comment in the doc. 55yiyi-asd09", - # ], - # must_not_include=[] - # ), - TestVector( + FileTestVector( filename="test_blog.html", mimetype="text/html", charset="utf-8", @@ -120,7 +104,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - TestVector( + FileTestVector( filename="test_wikipedia.html", mimetype="text/html", charset="utf-8", @@ -135,7 +119,7 @@ GENERAL_TEST_VECTORS = [ "move to sidebar", ], ), - TestVector( + FileTestVector( filename="test_serp.html", mimetype="text/html", charset="utf-8", @@ -150,10 +134,10 @@ GENERAL_TEST_VECTORS = [ "data:image/svg+xml,%3Csvg%20width%3D", ], ), - TestVector( + FileTestVector( filename="test_mskanji.csv", mimetype="text/csv", - charset="mskanji", + charset="cp932", url=None, must_include=[ "名前,年齢,住所", @@ -163,7 +147,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - TestVector( + FileTestVector( filename="test.json", mimetype="application/json", charset="ascii", @@ -174,7 +158,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[], ), - TestVector( + FileTestVector( filename="test_rss.xml", mimetype="text/xml", charset="utf-8", @@ -186,7 +170,7 @@ GENERAL_TEST_VECTORS = [ ], must_not_include=[" None: + """Test operations performed on StreamInfo objects.""" + + stream_info_original = StreamInfo( + 
mimetype="mimetype.1", + extension="extension.1", + charset="charset.1", + filename="filename.1", + local_path="local_path.1", + url="url.1", + ) + + # Check updating all attributes by keyword + keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"] + for keyword in keywords: + updated_stream_info = stream_info_original.copy_and_update( + **{keyword: f"{keyword}.2"} + ) + + # Make sure the targted attribute is updated + assert getattr(updated_stream_info, keyword) == f"{keyword}.2" + + # Make sure the other attributes are unchanged + for k in keywords: + if k != keyword: + assert getattr(stream_info_original, k) == getattr( + updated_stream_info, k + ) + + # Check updating all attributes by passing a new StreamInfo object + keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"] + for keyword in keywords: + updated_stream_info = stream_info_original.copy_and_update( + StreamInfo(**{keyword: f"{keyword}.2"}) + ) + + # Make sure the targted attribute is updated + assert getattr(updated_stream_info, keyword) == f"{keyword}.2" + + # Make sure the other attributes are unchanged + for k in keywords: + if k != keyword: + assert getattr(stream_info_original, k) == getattr( + updated_stream_info, k + ) + + # Check mixing and matching + updated_stream_info = stream_info_original.copy_and_update( + StreamInfo(extension="extension.2", filename="filename.2"), + mimetype="mimetype.3", + charset="charset.3", + ) + assert updated_stream_info.extension == "extension.2" + assert updated_stream_info.filename == "filename.2" + assert updated_stream_info.mimetype == "mimetype.3" + assert updated_stream_info.charset == "charset.3" + assert updated_stream_info.local_path == "local_path.1" + assert updated_stream_info.url == "url.1" + + # Check multiple StreamInfo objects + updated_stream_info = stream_info_original.copy_and_update( + StreamInfo(extension="extension.4", filename="filename.5"), + StreamInfo(mimetype="mimetype.6", 
charset="charset.7"), + ) + assert updated_stream_info.extension == "extension.4" + assert updated_stream_info.filename == "filename.5" + assert updated_stream_info.mimetype == "mimetype.6" + assert updated_stream_info.charset == "charset.7" + assert updated_stream_info.local_path == "local_path.1" + assert updated_stream_info.url == "url.1" + + +def test_docx_comments() -> None: + markitdown = MarkItDown() + + # Test DOCX processing, with comments and setting style_map on init + markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") + result = markitdown_with_style_map.convert( + os.path.join(TEST_FILES_DIR, "test_with_comment.docx") + ) + validate_strings(result, DOCX_COMMENT_TEST_STRINGS) + + +def test_input_as_strings() -> None: + markitdown = MarkItDown() + + # Test input from a stream + input_data = b"

Test

" + result = markitdown.convert_stream(io.BytesIO(input_data)) + assert "# Test" in result.text_content + + # Test input with leading blank characters + input_data = b" \n\n\n

Test

" + result = markitdown.convert_stream(io.BytesIO(input_data)) + assert "# Test" in result.text_content + + +@pytest.mark.skipif( + skip_remote, + reason="do not run tests that query external urls", +) +def test_markitdown_remote() -> None: + markitdown = MarkItDown() + + # By URL + result = markitdown.convert(PDF_TEST_URL) + for test_string in PDF_TEST_STRINGS: + assert test_string in result.text_content + + # Youtube + result = markitdown.convert(YOUTUBE_TEST_URL) + for test_string in YOUTUBE_TEST_STRINGS: + assert test_string in result.text_content + + +@pytest.mark.skipif( + skip_remote, + reason="do not run remotely run speech transcription tests", +) +def test_speech_transcription() -> None: + markitdown = MarkItDown() + + # Test WAV files, MP3 and M4A files + for file_name in ["test.wav", "test.mp3", "test.m4a"]: + result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name)) + result_lower = result.text_content.lower() + assert ( + ("1" in result_lower or "one" in result_lower) + and ("2" in result_lower or "two" in result_lower) + and ("3" in result_lower or "three" in result_lower) + and ("4" in result_lower or "four" in result_lower) + and ("5" in result_lower or "five" in result_lower) + ) + + +def test_exceptions() -> None: + # Check that an exception is raised when trying to convert an unsupported format + markitdown = MarkItDown() + with pytest.raises(UnsupportedFormatException): + markitdown.convert(os.path.join(TEST_FILES_DIR, "random.bin")) + + # Check that an exception is raised when trying to convert a file that is corrupted + with pytest.raises(FileConversionException) as exc_info: + markitdown.convert( + os.path.join(TEST_FILES_DIR, "random.bin"), file_extension=".pptx" + ) + assert len(exc_info.value.attempts) == 1 + assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter" + + +@pytest.mark.skipif( + skip_exiftool, + reason="do not run if exiftool is not installed", +) +def test_markitdown_exiftool() -> None: + 
which_exiftool = shutil.which("exiftool") + assert which_exiftool is not None + + # Test explicitly setting the location of exiftool + markitdown = MarkItDown(exiftool_path=which_exiftool) + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) + for key in JPG_TEST_EXIFTOOL: + target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" + assert target in result.text_content + + # Test setting the exiftool path through an environment variable + os.environ["EXIFTOOL_PATH"] = which_exiftool + markitdown = MarkItDown() + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) + for key in JPG_TEST_EXIFTOOL: + target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" + assert target in result.text_content + + # Test some other media types + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3")) + for key in MP3_TEST_EXIFTOOL: + target = f"{key}: {MP3_TEST_EXIFTOOL[key]}" + assert target in result.text_content + + +@pytest.mark.skipif( + skip_llm, + reason="do not run llm tests without a key", +) +def test_markitdown_llm() -> None: + client = openai.OpenAI() + markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o") + + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) + for test_string in LLM_TEST_STRINGS: + assert test_string in result.text_content + + # This is not super precise. It would also accept "red square", "blue circle", + # "the square is not blue", etc. But it's sufficient for this test. 
+ for test_string in ["red", "circle", "blue", "square"]: + assert test_string in result.text_content.lower() + + # Images embedded in PPTX files + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) + # LLM Captions are included + for test_string in LLM_TEST_STRINGS: + assert test_string in result.text_content + # Standard alt text is included + validate_strings(result, PPTX_TEST_STRINGS) + + +if __name__ == "__main__": + """Runs this file's tests from the command line.""" + for test in [ + test_stream_info_operations, + test_docx_comments, + test_input_as_strings, + test_markitdown_remote, + test_speech_transcription, + test_exceptions, + test_markitdown_exiftool, + test_markitdown_llm, + ]: + print(f"Running {test.__name__}...", end="") + test() + print("OK") + print("All tests passed!") diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py index 62424f0..6b7a421 100644 --- a/packages/markitdown/tests/test_module_vectors.py +++ b/packages/markitdown/tests/test_module_vectors.py @@ -50,9 +50,7 @@ def test_guess_stream_info(test_vector): assert guesses[0].mimetype == test_vector.mimetype assert guesses[0].extension == expected_extension - assert _normalize_charset(guesses[0].charset) == _normalize_charset( - test_vector.charset - ) + assert guesses[0].charset == test_vector.charset @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) @@ -124,19 +122,6 @@ def test_convert_url(test_vector): assert string not in result.markdown -def _normalize_charset(charset: str | None) -> str | None: - """ - Normalize a charset string to a canonical form. - """ - if charset is None: - return None - - try: - return codecs.lookup(charset).name - except LookupError: - return charset - - if __name__ == "__main__": import sys