diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index b116927..2e9965a 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -244,7 +244,14 @@ class MarkItDown: or source.startswith("https://") or source.startswith("file://") ): - return self.convert_url(source, stream_info=stream_info, *kwargs) + # Rename the url argument to mock_url + # (Deprecated -- use stream_info) + _kwargs = {k: v for k, v in kwargs.items()} + if "url" in _kwargs: + _kwargs["mock_url"] = _kwargs["url"] + del _kwargs["url"] + + return self.convert_url(source, stream_info=stream_info, **_kwargs) else: return self.convert_local(source, stream_info=stream_info, **kwargs) # Path object @@ -350,12 +357,26 @@ class MarkItDown: return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs) def convert_url( - self, url: str, **kwargs: Any + self, + url: str, + *, + stream_info: Optional[StreamInfo] = None, + file_extension: Optional[str] = None, # Deprecated -- use stream_info + mock_url: Optional[ + str + ] = None, # Mock the request as if it came from a different URL + **kwargs: Any, ) -> DocumentConverterResult: # TODO: fix kwargs type # Send a HTTP request to the URL response = self._requests_session.get(url, stream=True) response.raise_for_status() - return self.convert_response(response, **kwargs) + return self.convert_response( + response, + stream_info=stream_info, + file_extension=file_extension, + url=mock_url, + **kwargs, + ) def convert_response( self, @@ -660,10 +681,12 @@ class MarkItDown: return guesses - def _normalize_charset(self, charset: str) -> str: + def _normalize_charset(self, charset: str | None) -> str | None: """ Normalize a charset string to a canonical form. 
""" + if charset is None: + return None try: return codecs.lookup(charset).name except LookupError: diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py new file mode 100644 index 0000000..eeb83f1 --- /dev/null +++ b/packages/markitdown/tests/_test_vectors.py @@ -0,0 +1,230 @@ +import dataclasses +from typing import List + + +@dataclasses.dataclass(frozen=True, kw_only=True) +class TestVector(object): + filename: str + mimetype: str | None + charset: str | None + url: str | None + must_include: List[str] + must_not_include: List[str] + + +GENERAL_TEST_VECTORS = [ + TestVector( + filename="test.docx", + mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + charset=None, + url=None, + must_include=[ + "314b0a30-5b04-470b-b9f7-eed2c2bec74a", + "49e168b7-d2ae-407f-a055-2167576f39a1", + "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", + "# Abstract", + "# Introduction", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + ], + must_not_include=[], + ), + TestVector( + filename="test.xlsx", + mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + charset=None, + url=None, + must_include=[ + "## 09060124-b5e7-4717-9d07-3c046eb", + "6ff4173b-42a5-4784-9b19-f49caff4d93d", + "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", + ], + must_not_include=[], + ), + TestVector( + filename="test.xls", + mimetype="application/vnd.ms-excel", + charset=None, + url=None, + must_include=[ + "## 09060124-b5e7-4717-9d07-3c046eb", + "6ff4173b-42a5-4784-9b19-f49caff4d93d", + "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", + ], + must_not_include=[], + ), + TestVector( + filename="test.pptx", + mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", + charset=None, + url=None, + must_include=[ + "2cdda5c8-e50e-4db4-b5f0-9722a649f455", + "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", + "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", + "1b92870d-e3b5-4e65-8153-919f4ff45592", 
+ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title + "2003", # chart value + ], + must_not_include=[], + ), + TestVector( + filename="test_outlook_msg.msg", + mimetype="application/vnd.ms-outlook", + charset=None, + url=None, + must_include=[ + "# Email Message", + "**From:** test.sender@example.com", + "**To:** test.recipient@example.com", + "**Subject:** Test Email Message", + "## Content", + "This is the body of the test email message", + ], + must_not_include=[], + ), + TestVector( + filename="test.pdf", + mimetype="application/pdf", + charset=None, + url=None, + must_include=[ + "While there is contemporaneous exploration of multi-agent approaches" + ], + must_not_include=[], + ), + # TestVector( + # filename='test_with_comment.docx', + # mimetype='application/vnd.openxmlformats-officedocument.wordprocessingml.document', + # charset=None, + # must_include=[ + # "314b0a30-5b04-470b-b9f7-eed2c2bec74a", + # "49e168b7-d2ae-407f-a055-2167576f39a1", + # "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", + # "# Abstract", + # "# Introduction", + # "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + # "This is a test comment. 12df-321a", + # "Yet another comment in the doc. 55yiyi-asd09", + # ], + # must_not_include=[] + # ), + TestVector( + filename="test_blog.html", + mimetype="text/html", + charset="utf-8", + url="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math", + must_include=[ + "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? 
What about inference parameters?", + "an example where high cost can easily prevent a generic complex", + ], + must_not_include=[], + ), + TestVector( + filename="test_wikipedia.html", + mimetype="text/html", + charset="utf-8", + url="https://en.wikipedia.org/wiki/Microsoft", + must_include=[ + "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", + 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")', + ], + must_not_include=[ + "You are encouraged to create an account and log in", + "154 languages", + "move to sidebar", + ], + ), + TestVector( + filename="test_serp.html", + mimetype="text/html", + charset="utf-8", + url="https://www.bing.com/search?q=microsoft+wikipedia", + must_include=[ + "](https://en.wikipedia.org/wiki/Microsoft", + "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond", + "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox", + ], + must_not_include=[ + "https://www.bing.com/ck/a?!&&p=", + "data:image/svg+xml,%3Csvg%20width%3D", + ], + ), + TestVector( + filename="test_mskanji.csv", + mimetype="text/csv", + charset="mskanji", + url=None, + must_include=[ + "名前,年齢,住所", + "佐藤太郎,30,東京", + "三木英子,25,大阪", + "髙橋淳,35,名古屋", + ], + must_not_include=[], + ), + TestVector( + filename="test.json", + mimetype="application/json", + charset="ascii", + url=None, + must_include=[ + "5b64c88c-b3c3-4510-bcb8-da0b200602d8", + "9700dc99-6685-40b4-9a3a-5e406dcb37f3", + ], + must_not_include=[], + ), + TestVector( + filename="test_rss.xml", + mimetype="text/xml", + charset="utf-8", + url=None, + must_include=[ + "# The Official Microsoft Blog", + "## Ignite 2024: Why nearly 70% of the Fortune 500 now use Microsoft 365 Copilot", + "In the case of AI, it is absolutely true that the industry is moving incredibly fast", + ], + must_not_include=[" None: - result = subprocess.run( - ["python", "-m", "markitdown", "--version"], 
capture_output=True, text=True - ) - - assert result.returncode == 0, f"CLI exited with error: {result.stderr}" - assert __version__ in result.stdout, f"Version not found in output: {result.stdout}" - - -def test_invalid_flag(shared_tmp_dir) -> None: - result = subprocess.run( - ["python", "-m", "markitdown", "--foobar"], capture_output=True, text=True - ) - - assert result.returncode != 0, f"CLI exited with error: {result.stderr}" - assert ( - "unrecognized arguments" in result.stderr - ), f"Expected 'unrecognized arguments' to appear in STDERR" - assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR" - - -def test_output_to_stdout(shared_tmp_dir) -> None: - # DOC X - result = subprocess.run( - ["python", "-m", "markitdown", os.path.join(TEST_FILES_DIR, "test.docx")], - capture_output=True, - text=True, - ) - - assert result.returncode == 0, f"CLI exited with error: {result.stderr}" - for test_string in DOCX_TEST_STRINGS: - assert ( - test_string in result.stdout - ), f"Expected string not found in output: {test_string}" - - -def test_output_to_file(shared_tmp_dir) -> None: - # DOC X, flag -o at the end - docx_output_file_1 = os.path.join(shared_tmp_dir, "test_docx_1.md") - result = subprocess.run( - [ - "python", - "-m", - "markitdown", - os.path.join(TEST_FILES_DIR, "test.docx"), - "-o", - docx_output_file_1, - ], - capture_output=True, - text=True, - ) - - assert result.returncode == 0, f"CLI exited with error: {result.stderr}" - assert os.path.exists( - docx_output_file_1 - ), f"Output file not created: {docx_output_file_1}" - - with open(docx_output_file_1, "r") as f: - output = f.read() - for test_string in DOCX_TEST_STRINGS: - assert ( - test_string in output - ), f"Expected string not found in output: {test_string}" - - # DOC X, flag -o at the beginning - docx_output_file_2 = os.path.join(shared_tmp_dir, "test_docx_2.md") - result = subprocess.run( - [ - "python", - "-m", - "markitdown", - "-o", - docx_output_file_2, - 
os.path.join(TEST_FILES_DIR, "test.docx"), - ], - capture_output=True, - text=True, - ) - - assert result.returncode == 0, f"CLI exited with error: {result.stderr}" - assert os.path.exists( - docx_output_file_2 - ), f"Output file not created: {docx_output_file_2}" - - with open(docx_output_file_2, "r") as f: - output = f.read() - for test_string in DOCX_TEST_STRINGS: - assert ( - test_string in output - ), f"Expected string not found in output: {test_string}" - - -if __name__ == "__main__": - """Runs this file's tests from the command line.""" - import tempfile - - with tempfile.TemporaryDirectory() as tmp_dir: - test_version(tmp_dir) - test_invalid_flag(tmp_dir) - test_output_to_stdout(tmp_dir) - test_output_to_file(tmp_dir) - print("All tests passed!") diff --git a/packages/markitdown/tests/test_cli_misc.py b/packages/markitdown/tests/test_cli_misc.py new file mode 100644 index 0000000..d301654 --- /dev/null +++ b/packages/markitdown/tests/test_cli_misc.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 -m pytest +import subprocess +import pytest +from markitdown import __version__ + + +def test_version() -> None: + result = subprocess.run( + ["python", "-m", "markitdown", "--version"], capture_output=True, text=True + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + assert __version__ in result.stdout, f"Version not found in output: {result.stdout}" + + +def test_invalid_flag() -> None: + result = subprocess.run( + ["python", "-m", "markitdown", "--foobar"], capture_output=True, text=True + ) + + assert result.returncode != 0, f"CLI exited with error: {result.stderr}" + assert ( + "unrecognized arguments" in result.stderr + ), f"Expected 'unrecognized arguments' to appear in STDERR" + assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR" + + +if __name__ == "__main__": + """Runs this file's tests from the command line.""" + test_version() + test_invalid_flag() + print("All tests passed!") diff --git 
a/packages/markitdown/tests/test_cli_vectors.py b/packages/markitdown/tests/test_cli_vectors.py new file mode 100644 index 0000000..9d4ad17 --- /dev/null +++ b/packages/markitdown/tests/test_cli_vectors.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 -m pytest +import os +import time +import pytest +import subprocess +import locale +from typing import List + +if __name__ == "__main__": + from _test_vectors import GENERAL_TEST_VECTORS, TestVector +else: + from ._test_vectors import GENERAL_TEST_VECTORS, TestVector + +from markitdown import ( + MarkItDown, + UnsupportedFormatException, + FileConversionException, + StreamInfo, +) + +skip_remote = ( + True if os.environ.get("GITHUB_ACTIONS") else False +) # Don't run these tests in CI + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") +TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files" + + +# Prepare CLI test vectors (remove vectors that require mocking the url) +CLI_TEST_VECTORS: List[TestVector] = [] +for test_vector in GENERAL_TEST_VECTORS: + if test_vector.url is not None: + continue + CLI_TEST_VECTORS.append(test_vector) + + +@pytest.fixture(scope="session") +def shared_tmp_dir(tmp_path_factory): + return tmp_path_factory.mktemp("pytest_tmp") + + +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def test_output_to_stdout(shared_tmp_dir, test_vector) -> None: + """Test that the CLI outputs to stdout correctly.""" + + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + os.path.join(TEST_FILES_DIR, test_vector.filename), + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + for test_string in test_vector.must_include: + assert test_string in result.stdout + for test_string in test_vector.must_not_include: + assert test_string not in result.stdout + + +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def
test_output_to_file(shared_tmp_dir, test_vector) -> None: + """Test that the CLI outputs to a file correctly.""" + + output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output") + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + "-o", + output_file, + os.path.join(TEST_FILES_DIR, test_vector.filename), + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + assert os.path.exists(output_file), f"Output file not created: {output_file}" + + with open(output_file, "r") as f: + output_data = f.read() + for test_string in test_vector.must_include: + assert test_string in output_data + for test_string in test_vector.must_not_include: + assert test_string not in output_data + + os.remove(output_file) + assert not os.path.exists(output_file), f"Output file not deleted: {output_file}" + + +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None: + """Test that the CLI reads from stdin correctly.""" + + test_input = b"" + with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: + test_input = stream.read() + + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + os.path.join(TEST_FILES_DIR, test_vector.filename), + ], + input=test_input, + capture_output=True, + text=False, + ) + + stdout = result.stdout.decode(locale.getpreferredencoding()) + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + for test_string in test_vector.must_include: + assert test_string in stdout + for test_string in test_vector.must_not_include: + assert test_string not in stdout + + +@pytest.mark.skipif( + skip_remote, + reason="do not run tests that query external urls", +) +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def test_convert_url(tmp_dir, test_vector): + """Test the conversion of a stream with no stream info.""" + # Note: tmp_dir is not
used here, but is needed to match the signature + + markitdown = MarkItDown() + + time.sleep(1) # Ensure we don't hit rate limits + result = subprocess.run( + ["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename], + capture_output=True, + text=False, + ) + + stdout = result.stdout.decode(locale.getpreferredencoding()) + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + for test_string in test_vector.must_include: + assert test_string in stdout + for test_string in test_vector.must_not_include: + assert test_string not in stdout + + +if __name__ == "__main__": + import sys + import tempfile + + """Runs this file's tests from the command line.""" + + with tempfile.TemporaryDirectory() as tmp_dir: + for test_function in [ + test_output_to_stdout, + test_output_to_file, + test_input_from_stdin_without_hints, + test_convert_url, + ]: + for test_vector in CLI_TEST_VECTORS: + print( + f"Running {test_function.__name__} on {test_vector.filename}...", + end="", + ) + test_function(tmp_dir, test_vector) + print("OK") + print("All tests passed!") diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py deleted file mode 100644 index f76ff8c..0000000 --- a/packages/markitdown/tests/test_markitdown.py +++ /dev/null @@ -1,585 +0,0 @@ -#!/usr/bin/env python3 -m pytest -import io -import os -import shutil -import openai - -import pytest -import requests - -from markitdown import ( - MarkItDown, - UnsupportedFormatException, - FileConversionException, - StreamInfo, -) - -skip_remote = ( - True if os.environ.get("GITHUB_ACTIONS") else False -) # Don't run these tests in CI - - -# Don't run the llm tests without a key and the client library -skip_llm = False if os.environ.get("OPENAI_API_KEY") else True -try: - import openai -except ModuleNotFoundError: - skip_llm = True - -# Skip exiftool tests if not installed -skip_exiftool = shutil.which("exiftool") is None - -TEST_FILES_DIR = 
os.path.join(os.path.dirname(__file__), "test_files") - -JPG_TEST_EXIFTOOL = { - "Author": "AutoGen Authors", - "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - "Description": "AutoGen enables diverse LLM-based applications", - "ImageSize": "1615x1967", - "DateTimeOriginal": "2024:03:14 22:10:00", -} - -MP3_TEST_EXIFTOOL = { - "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e", - "Artist": "Artist Name Test String", - "Album": "Album Name Test String", - "SampleRate": "48000", -} - -PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf" -PDF_TEST_STRINGS = [ - "While there is contemporaneous exploration of multi-agent approaches" -] - -YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg" -YOUTUBE_TEST_STRINGS = [ - "## AutoGen FULL Tutorial with Python (Step-By-Step)", - "This is an intermediate tutorial for installing and using AutoGen locally", - "PT15M4S", - "the model we're going to be using today is GPT 3.5 turbo", # From the transcript -] - -XLSX_TEST_STRINGS = [ - "## 09060124-b5e7-4717-9d07-3c046eb", - "6ff4173b-42a5-4784-9b19-f49caff4d93d", - "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", -] - -XLS_TEST_STRINGS = [ - "## 09060124-b5e7-4717-9d07-3c046eb", - "6ff4173b-42a5-4784-9b19-f49caff4d93d", - "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", -] - -DOCX_TEST_STRINGS = [ - "314b0a30-5b04-470b-b9f7-eed2c2bec74a", - "49e168b7-d2ae-407f-a055-2167576f39a1", - "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", - "# Abstract", - "# Introduction", - "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", -] - -MSG_TEST_STRINGS = [ - "# Email Message", - "**From:** test.sender@example.com", - "**To:** test.recipient@example.com", - "**Subject:** Test Email Message", - "## Content", - "This is the body of the test email message", -] - -DOCX_COMMENT_TEST_STRINGS = [ - "314b0a30-5b04-470b-b9f7-eed2c2bec74a", - "49e168b7-d2ae-407f-a055-2167576f39a1", - "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", - "# Abstract", - "# 
Introduction", - "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - "This is a test comment. 12df-321a", - "Yet another comment in the doc. 55yiyi-asd09", -] - -PPTX_TEST_STRINGS = [ - "2cdda5c8-e50e-4db4-b5f0-9722a649f455", - "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", - "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", - "1b92870d-e3b5-4e65-8153-919f4ff45592", - "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title - "2003", # chart value -] - -BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" -BLOG_TEST_STRINGS = [ - "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?", - "an example where high cost can easily prevent a generic complex", -] - - -RSS_TEST_STRINGS = [ - "The Official Microsoft Blog", - "In the case of AI, it is absolutely true that the industry is moving incredibly fast", -] - - -WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft" -WIKIPEDIA_TEST_STRINGS = [ - "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", - 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")', -] -WIKIPEDIA_TEST_EXCLUDES = [ - "You are encouraged to create an account and log in", - "154 languages", - "move to sidebar", -] - -SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia" -SERP_TEST_STRINGS = [ - "](https://en.wikipedia.org/wiki/Microsoft", - "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond", - "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox", -] -SERP_TEST_EXCLUDES = [ - "https://www.bing.com/ck/a?!&&p=", - 
"data:image/svg+xml,%3Csvg%20width%3D", -] - -CSV_CP932_TEST_STRINGS = [ - "名前,年齢,住所", - "佐藤太郎,30,東京", - "三木英子,25,大阪", - "髙橋淳,35,名古屋", -] - -LLM_TEST_STRINGS = [ - "5bda1dd6", -] - -JSON_TEST_STRINGS = [ - "5b64c88c-b3c3-4510-bcb8-da0b200602d8", - "9700dc99-6685-40b4-9a3a-5e406dcb37f3", -] - - -# --- Helper Functions --- -def validate_strings(result, expected_strings, exclude_strings=None): - """Validate presence or absence of specific strings.""" - text_content = result.text_content.replace("\\", "") - for string in expected_strings: - assert string in text_content - if exclude_strings: - for string in exclude_strings: - assert string not in text_content - - -def test_stream_info_operations() -> None: - """Test operations performed on StreamInfo objects.""" - - stream_info_original = StreamInfo( - mimetype="mimetype.1", - extension="extension.1", - charset="charset.1", - filename="filename.1", - local_path="local_path.1", - url="url.1", - ) - - # Check updating all attributes by keyword - keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"] - for keyword in keywords: - updated_stream_info = stream_info_original.copy_and_update( - **{keyword: f"{keyword}.2"} - ) - - # Make sure the targted attribute is updated - assert getattr(updated_stream_info, keyword) == f"{keyword}.2" - - # Make sure the other attributes are unchanged - for k in keywords: - if k != keyword: - assert getattr(stream_info_original, k) == getattr( - updated_stream_info, k - ) - - # Check updating all attributes by passing a new StreamInfo object - keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"] - for keyword in keywords: - updated_stream_info = stream_info_original.copy_and_update( - StreamInfo(**{keyword: f"{keyword}.2"}) - ) - - # Make sure the targted attribute is updated - assert getattr(updated_stream_info, keyword) == f"{keyword}.2" - - # Make sure the other attributes are unchanged - for k in keywords: - if k != keyword: - assert 
getattr(stream_info_original, k) == getattr( - updated_stream_info, k - ) - - # Check mixing and matching - updated_stream_info = stream_info_original.copy_and_update( - StreamInfo(extension="extension.2", filename="filename.2"), - mimetype="mimetype.3", - charset="charset.3", - ) - assert updated_stream_info.extension == "extension.2" - assert updated_stream_info.filename == "filename.2" - assert updated_stream_info.mimetype == "mimetype.3" - assert updated_stream_info.charset == "charset.3" - assert updated_stream_info.local_path == "local_path.1" - assert updated_stream_info.url == "url.1" - - # Check multiple StreamInfo objects - updated_stream_info = stream_info_original.copy_and_update( - StreamInfo(extension="extension.4", filename="filename.5"), - StreamInfo(mimetype="mimetype.6", charset="charset.7"), - ) - assert updated_stream_info.extension == "extension.4" - assert updated_stream_info.filename == "filename.5" - assert updated_stream_info.mimetype == "mimetype.6" - assert updated_stream_info.charset == "charset.7" - assert updated_stream_info.local_path == "local_path.1" - assert updated_stream_info.url == "url.1" - - -def test_stream_info_guesses() -> None: - """Test StreamInfo guesses based on stream content.""" - - test_tuples = [ - ( - os.path.join(TEST_FILES_DIR, "test.xlsx"), - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ), - ( - os.path.join(TEST_FILES_DIR, "test.docx"), - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ), - ( - os.path.join(TEST_FILES_DIR, "test.pptx"), - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ), - (os.path.join(TEST_FILES_DIR, "test.xls"), "application/vnd.ms-excel"), - ] - - markitdown = MarkItDown() - for file_path, expected_mimetype in test_tuples: - with open(file_path, "rb") as f: - guesses = markitdown._get_stream_info_guesses( - f, - StreamInfo( - filename=os.path.basename(file_path), - local_path=file_path, - 
extension=os.path.splitext(file_path)[1], - ), - ) - assert len(guesses) > 0 - assert guesses[0].mimetype == expected_mimetype - assert guesses[0].extension == os.path.splitext(file_path)[1] - - -@pytest.mark.skipif( - skip_remote, - reason="do not run tests that query external urls", -) -def test_markitdown_remote() -> None: - markitdown = MarkItDown() - - # By URL - result = markitdown.convert(PDF_TEST_URL) - for test_string in PDF_TEST_STRINGS: - assert test_string in result.text_content - - # By stream - response = requests.get(PDF_TEST_URL) - result = markitdown.convert_stream( - io.BytesIO(response.content), file_extension=".pdf", url=PDF_TEST_URL - ) - for test_string in PDF_TEST_STRINGS: - assert test_string in result.text_content - - # Youtube - result = markitdown.convert(YOUTUBE_TEST_URL) - for test_string in YOUTUBE_TEST_STRINGS: - assert test_string in result.text_content - - -def test_markitdown_local() -> None: - markitdown = MarkItDown() - - # Test PDF processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pdf")) - validate_strings(result, PDF_TEST_STRINGS) - - # Test XLSX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) - validate_strings(result, XLSX_TEST_STRINGS) - - # Test XLS processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls")) - for test_string in XLS_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test DOCX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) - validate_strings(result, DOCX_TEST_STRINGS) - - # Test DOCX processing, with comments - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), - style_map="comment-reference => ", - ) - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - - # Test DOCX processing, with comments and setting style_map on init - markitdown_with_style_map = MarkItDown(style_map="comment-reference 
=> ") - result = markitdown_with_style_map.convert( - os.path.join(TEST_FILES_DIR, "test_with_comment.docx") - ) - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - - # Test PPTX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) - validate_strings(result, PPTX_TEST_STRINGS) - - # Test HTML processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL - ) - validate_strings(result, BLOG_TEST_STRINGS) - - # Test Wikipedia processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL - ) - text_content = result.text_content.replace("\\", "") - validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES) - - # Test Bing processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL - ) - text_content = result.text_content.replace("\\", "") - validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES) - - # Test RSS processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml")) - text_content = result.text_content.replace("\\", "") - for test_string in RSS_TEST_STRINGS: - assert test_string in text_content - - # Test MSG (Outlook email) processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) - validate_strings(result, MSG_TEST_STRINGS) - - # Test non-UTF-8 encoding - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) - validate_strings(result, CSV_CP932_TEST_STRINGS) - - # Test JSON processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json")) - validate_strings(result, JSON_TEST_STRINGS) - - # # Test ZIP file processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) - validate_strings(result, DOCX_TEST_STRINGS) - validate_strings(result, XLSX_TEST_STRINGS) - validate_strings(result, BLOG_TEST_STRINGS) - - # Test input from a 
stream - input_data = b"

Test

" - result = markitdown.convert_stream(io.BytesIO(input_data)) - assert "# Test" in result.text_content - - # Test input with leading blank characters - input_data = b" \n\n\n

Test

" - result = markitdown.convert_stream(io.BytesIO(input_data)) - assert "# Test" in result.text_content - - -def test_markitdown_streams() -> None: - markitdown = MarkItDown() - - # Test PDF processing - with open(os.path.join(TEST_FILES_DIR, "test.pdf"), "rb") as f: - result = markitdown.convert(f, file_extension=".pdf") - validate_strings(result, PDF_TEST_STRINGS) - - # Test XLSX processing - with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f: - result = markitdown.convert(f, file_extension=".xlsx") - validate_strings(result, XLSX_TEST_STRINGS) - - # Test XLS processing - with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f: - result = markitdown.convert(f, file_extension=".xls") - for test_string in XLS_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test DOCX processing - with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f: - result = markitdown.convert(f, file_extension=".docx") - validate_strings(result, DOCX_TEST_STRINGS) - - # Test DOCX processing, with comments - with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f: - result = markitdown.convert( - f, - file_extension=".docx", - style_map="comment-reference => ", - ) - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - - # Test DOCX processing, with comments and setting style_map on init - markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") - with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f: - result = markitdown_with_style_map.convert(f, file_extension=".docx") - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - - # Test PPTX processing - with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f: - result = markitdown.convert(f, file_extension=".pptx") - validate_strings(result, PPTX_TEST_STRINGS) - - # Test HTML processing - with open(os.path.join(TEST_FILES_DIR, "test_blog.html"), "rb") as f: - result = 
markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL) - validate_strings(result, BLOG_TEST_STRINGS) - - # Test Wikipedia processing - with open(os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rb") as f: - result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL) - text_content = result.text_content.replace("\\", "") - validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES) - - # Test Bing processing - with open(os.path.join(TEST_FILES_DIR, "test_serp.html"), "rb") as f: - result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL) - text_content = result.text_content.replace("\\", "") - validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES) - - # Test RSS processing - with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f: - result = markitdown.convert(f, file_extension=".xml") - text_content = result.text_content.replace("\\", "") - for test_string in RSS_TEST_STRINGS: - assert test_string in text_content - - # Test MSG (Outlook email) processing - with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f: - result = markitdown.convert(f, file_extension=".msg") - validate_strings(result, MSG_TEST_STRINGS) - - # Test JSON processing - with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f: - result = markitdown.convert(f, file_extension=".json") - validate_strings(result, JSON_TEST_STRINGS) - - -@pytest.mark.skipif( - skip_remote, - reason="do not run remotely run speech transcription tests", -) -def test_speech_transcription() -> None: - markitdown = MarkItDown() - - # Test WAV files, MP3 and M4A files - for file_name in ["test.wav", "test.mp3", "test.m4a"]: - result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name)) - result_lower = result.text_content.lower() - assert ( - ("1" in result_lower or "one" in result_lower) - and ("2" in result_lower or "two" in result_lower) - and ("3" in result_lower or "three" in result_lower) - and ("4" 
in result_lower or "four" in result_lower) - and ("5" in result_lower or "five" in result_lower) - ) - - -def test_exceptions() -> None: - # Check that an exception is raised when trying to convert an unsupported format - markitdown = MarkItDown() - with pytest.raises(UnsupportedFormatException): - markitdown.convert(os.path.join(TEST_FILES_DIR, "random.bin")) - - # Check that an exception is raised when trying to convert a file that is corrupted - with pytest.raises(FileConversionException) as exc_info: - markitdown.convert( - os.path.join(TEST_FILES_DIR, "random.bin"), file_extension=".pptx" - ) - assert len(exc_info.value.attempts) == 1 - assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter" - - -@pytest.mark.skipif( - skip_exiftool, - reason="do not run if exiftool is not installed", -) -def test_markitdown_exiftool() -> None: - which_exiftool = shutil.which("exiftool") - assert which_exiftool is not None - - # Test explicitly setting the location of exiftool - markitdown = MarkItDown(exiftool_path=which_exiftool) - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) - for key in JPG_TEST_EXIFTOOL: - target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" - assert target in result.text_content - - # Test setting the exiftool path through an environment variable - os.environ["EXIFTOOL_PATH"] = which_exiftool - markitdown = MarkItDown() - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) - for key in JPG_TEST_EXIFTOOL: - target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" - assert target in result.text_content - - # Test some other media types - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3")) - for key in MP3_TEST_EXIFTOOL: - target = f"{key}: {MP3_TEST_EXIFTOOL[key]}" - assert target in result.text_content - - -@pytest.mark.skipif( - skip_llm, - reason="do not run llm tests without a key", -) -def test_markitdown_llm() -> None: - client = openai.OpenAI() - markitdown = MarkItDown(llm_client=client, 
llm_model="gpt-4o") - - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) - for test_string in LLM_TEST_STRINGS: - assert test_string in result.text_content - - # This is not super precise. It would also accept "red square", "blue circle", - # "the square is not blue", etc. But it's sufficient for this test. - for test_string in ["red", "circle", "blue", "square"]: - assert test_string in result.text_content.lower() - - # Images embedded in PPTX files - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) - # LLM Captions are included - for test_string in LLM_TEST_STRINGS: - assert test_string in result.text_content - # Standard alt text is included - validate_strings(result, PPTX_TEST_STRINGS) - - -if __name__ == "__main__": - """Runs this file's tests from the command line.""" - test_stream_info_operations() - test_stream_info_guesses() - test_markitdown_remote() - test_markitdown_local() - test_markitdown_streams() - test_speech_transcription() - test_exceptions() - test_markitdown_exiftool() - test_markitdown_llm() - print("All tests passed!") diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py new file mode 100644 index 0000000..62424f0 --- /dev/null +++ b/packages/markitdown/tests/test_module_vectors.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 -m pytest +import os +import time +import pytest +import codecs + + +if __name__ == "__main__": + from _test_vectors import GENERAL_TEST_VECTORS +else: + from ._test_vectors import GENERAL_TEST_VECTORS + +from markitdown import ( + MarkItDown, + UnsupportedFormatException, + FileConversionException, + StreamInfo, +) + +skip_remote = ( + True if os.environ.get("GITHUB_ACTIONS") else False +) # Don't run these tests in CI + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") +TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files" 
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_guess_stream_info(test_vector):
    """Test the ability to guess stream info.

    The vector's file is opened from ``TEST_FILES_DIR`` and passed to
    ``MarkItDown._get_stream_info_guesses`` together with a base guess built
    from its filename, local path and extension. The first guess returned
    must match the vector's mimetype, the file's extension and, after
    normalization, the vector's charset.
    """
    markitdown = MarkItDown()

    local_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    expected_extension = os.path.splitext(test_vector.filename)[1]

    with open(local_path, "rb") as stream:
        guesses = markitdown._get_stream_info_guesses(
            stream,
            base_guess=StreamInfo(
                filename=os.path.basename(test_vector.filename),
                local_path=local_path,
                extension=expected_extension,
            ),
        )

        # For some limited exceptions, we can't guarantee the exact
        # mimetype or extension, so we'll special-case them here.
        if test_vector.filename in ["test_outlook_msg.msg"]:
            return

        best_guess = guesses[0]
        assert best_guess.mimetype == test_vector.mimetype
        assert best_guess.extension == expected_extension
        # Compare canonical codec names so aliases of one charset are equal.
        assert _normalize_charset(best_guess.charset) == _normalize_charset(
            test_vector.charset
        )


@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_local(test_vector):
    """Test the conversion of a local file, addressed by its path."""
    markitdown = MarkItDown()

    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, test_vector.filename), url=test_vector.url
    )
    _assert_expected_content(result.markdown, test_vector)


@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_stream_with_hints(test_vector):
    """Test the conversion of a stream with full stream info.

    The hints (extension, mimetype, charset) are taken from the test vector.
    """
    markitdown = MarkItDown()

    stream_info = StreamInfo(
        extension=os.path.splitext(test_vector.filename)[1],
        mimetype=test_vector.mimetype,
        charset=test_vector.charset,
    )

    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
        result = markitdown.convert(
            stream, stream_info=stream_info, url=test_vector.url
        )
        _assert_expected_content(result.markdown, test_vector)


@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_stream_without_hints(test_vector):
    """Test the conversion of a stream with no stream info."""
    markitdown = MarkItDown()

    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
        result = markitdown.convert(stream, url=test_vector.url)
        _assert_expected_content(result.markdown, test_vector)


@pytest.mark.skipif(
    skip_remote,
    reason="do not run tests that query external urls",
)
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_url(test_vector):
    """Test the conversion of a file fetched from a remote URL.

    The file is requested from ``TEST_FILES_URL``; the vector's ``url`` is
    passed along so the content is treated as if it came from that address.
    """
    markitdown = MarkItDown()

    time.sleep(1)  # Ensure we don't hit rate limits

    result = markitdown.convert(
        TEST_FILES_URL + "/" + test_vector.filename,
        url=test_vector.url,  # Mock where this file would be found
    )
    _assert_expected_content(result.markdown, test_vector)


def _assert_expected_content(markdown, test_vector):
    """Assert that converted output satisfies a test vector's expectations.

    Every string in ``test_vector.must_include`` has to occur in ``markdown``
    and no string in ``test_vector.must_not_include`` may occur in it.
    """
    for string in test_vector.must_include:
        assert (
            string in markdown
        ), f"{test_vector.filename}: expected {string!r} in the converted output"
    for string in test_vector.must_not_include:
        assert (
            string not in markdown
        ), f"{test_vector.filename}: unexpected {string!r} in the converted output"


def _normalize_charset(charset: str | None) -> str | None:
    """
    Normalize a charset string to a canonical form.

    Returns ``None`` for ``None``, the codec's canonical name when
    ``codecs.lookup`` recognizes the charset, and the input unchanged
    otherwise.
    """
    if charset is None:
        return None

    try:
        return codecs.lookup(charset).name
    except LookupError:
        return charset


if __name__ == "__main__":
    # Runs this file's tests from the command line, without the pytest runner.
    for test_function in [
        test_guess_stream_info,
        test_convert_local,
        test_convert_stream_with_hints,
        test_convert_stream_without_hints,
        test_convert_url,
    ]:
        # pytest marks are inert when a test function is called directly, so
        # the remote-test opt-out has to be honored by hand here.
        if test_function is test_convert_url and skip_remote:
            print(f"Skipping {test_function.__name__}: remote tests are disabled")
            continue

        for test_vector in GENERAL_TEST_VECTORS:
            print(
                f"Running {test_function.__name__} on {test_vector.filename}...", end=""
            )
            test_function(test_vector)
            print("OK")
    print("All tests passed!")