From dbc93dd584d368b35571a8707b96ac00d44c2cfe Mon Sep 17 00:00:00 2001
From: Athroniaeth
Date: Tue, 21 Jan 2025 23:58:22 +0100
Subject: [PATCH] =?UTF-8?q?Adds=20tests=20for=20adding=20the=20=E2=80=9Cco?=
 =?UTF-8?q?nvert=5Flocal=5Fcontent=E2=80=9D=20method?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_markitdown.py | 66 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 689d6f3..bdaffb5 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -189,6 +189,72 @@ def test_markitdown_remote() -> None:
     #     assert test_string in result.text_content
 
 
+def test_markitdown_local_content() -> None:
+    """Exercise MarkItDown.convert_local_content() on in-memory text.
+
+    Each fixture under TEST_FILES_DIR is read into a str and handed to
+    convert_local_content() with an explicit file_extension (plus a url or
+    encoding where the case supplies one); the result is then checked
+    against the expected strings for that fixture.
+    """
+    markitdown = MarkItDown()
+
+    # Test HTML processing (Blog)
+    blog_html_path = os.path.join(TEST_FILES_DIR, "test_blog.html")
+    with open(blog_html_path, "r", encoding="utf-8") as f:
+        blog_html_content = f.read()
+    result = markitdown.convert_local_content(
+        blog_html_content, file_extension=".html", url=BLOG_TEST_URL
+    )
+    validate_strings(result, BLOG_TEST_STRINGS)
+
+    # Test Wikipedia HTML processing
+    wikipedia_html_path = os.path.join(TEST_FILES_DIR, "test_wikipedia.html")
+    with open(wikipedia_html_path, "r", encoding="utf-8") as f:
+        wikipedia_html_content = f.read()
+    result = markitdown.convert_local_content(
+        wikipedia_html_content, file_extension=".html", url=WIKIPEDIA_TEST_URL
+    )
+    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
+
+    # Test Bing SERP HTML processing
+    serp_html_path = os.path.join(TEST_FILES_DIR, "test_serp.html")
+    with open(serp_html_path, "r", encoding="utf-8") as f:
+        serp_html_content = f.read()
+    result = markitdown.convert_local_content(
+        serp_html_content, file_extension=".html", url=SERP_TEST_URL
+    )
+    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
+
+    # Test RSS processing
+    rss_path = os.path.join(TEST_FILES_DIR, "test_rss.xml")
+    with open(rss_path, "r", encoding="utf-8") as f:
+        rss_content = f.read()
+    result = markitdown.convert_local_content(rss_content, file_extension=".xml")
+    # Backslashes are stripped from the output before matching.
+    text_content = result.text_content.replace("\\", "")
+    for test_string in RSS_TEST_STRINGS:
+        assert test_string in text_content
+
+    # Test non-UTF-8 encoding (CSV CP932)
+    csv_cp932_path = os.path.join(TEST_FILES_DIR, "test_mskanji.csv")
+    with open(csv_cp932_path, "r", encoding="cp932") as f:
+        csv_cp932_content = f.read()
+    # NOTE(review): the content is already decoded to str above; confirm how
+    # convert_local_content() uses the encoding argument for str input.
+    result = markitdown.convert_local_content(
+        csv_cp932_content, file_extension=".csv", encoding="cp932"
+    )
+    validate_strings(result, CSV_CP932_TEST_STRINGS)
+
+    # Test JSON processing
+    json_path = os.path.join(TEST_FILES_DIR, "test.json")
+    with open(json_path, "r", encoding="utf-8") as f:
+        json_content = f.read()
+    result = markitdown.convert_local_content(json_content, file_extension=".json")
+    validate_strings(result, JSON_TEST_STRINGS)
+
+
 def test_markitdown_local() -> None:
     markitdown = MarkItDown()
 