Adds tests for the “convert_local_content” method
This commit is contained in:
parent
f58a864951
commit
dbc93dd584
1 changed files with 67 additions and 0 deletions
|
|
@ -189,6 +189,73 @@ def test_markitdown_remote() -> None:
|
||||||
# assert test_string in result.text_content
|
# assert test_string in result.text_content
|
||||||
|
|
||||||
|
|
||||||
|
def test_markitdown_local_content() -> None:
    """Check ``convert_local_content`` against text read from local fixtures.

    Each fixture under ``TEST_FILES_DIR`` is read into a string and handed to
    ``MarkItDown.convert_local_content`` together with a file-extension hint
    (plus a source URL or an encoding for the cases that supply one). The
    conversion result is then checked against the expected-string constants
    for that fixture.
    """

    def read_fixture(filename: str, encoding: str = "utf-8") -> str:
        """Return the decoded text of *filename* from ``TEST_FILES_DIR``."""
        fixture_path = os.path.join(TEST_FILES_DIR, filename)
        with open(fixture_path, "r", encoding=encoding) as f:
            return f.read()

    markitdown = MarkItDown()

    # Test HTML processing (Blog)
    result = markitdown.convert_local_content(
        read_fixture("test_blog.html"),
        file_extension=".html",
        url=BLOG_TEST_URL,
    )
    validate_strings(result, BLOG_TEST_STRINGS)

    # Test Wikipedia HTML processing
    result = markitdown.convert_local_content(
        read_fixture("test_wikipedia.html"),
        file_extension=".html",
        url=WIKIPEDIA_TEST_URL,
    )
    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)

    # Test Bing SERP HTML processing
    result = markitdown.convert_local_content(
        read_fixture("test_serp.html"),
        file_extension=".html",
        url=SERP_TEST_URL,
    )
    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)

    # Test RSS processing
    result = markitdown.convert_local_content(
        read_fixture("test_rss.xml"),
        file_extension=".xml",
    )
    # Backslashes are stripped from the output before matching
    # (presumably Markdown escape characters -- TODO confirm).
    text_content = result.text_content.replace("\\", "")
    for test_string in RSS_TEST_STRINGS:
        assert test_string in text_content

    # Test non-UTF-8 encoding (CSV CP932)
    # The fixture is decoded as cp932 here, and the same encoding name is
    # also forwarded to the converter.
    result = markitdown.convert_local_content(
        read_fixture("test_mskanji.csv", encoding="cp932"),
        file_extension=".csv",
        encoding="cp932",
    )
    validate_strings(result, CSV_CP932_TEST_STRINGS)

    # Test JSON processing
    result = markitdown.convert_local_content(
        read_fixture("test.json"),
        file_extension=".json",
    )
    validate_strings(result, JSON_TEST_STRINGS)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_markitdown_local() -> None:
|
def test_markitdown_local() -> None:
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue