74 lines
2 KiB
Python
74 lines
2 KiB
Python
|
|
import os
|
||
|
|
from pathlib import Path
|
||
|
|
from warnings import catch_warnings, resetwarnings
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from markitdown import MarkItDown
|
||
|
|
|
||
|
|
# Shared fixture directory: "test_files" two levels above this module.
TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"

# Don't run the llm tests without a key and the client library
# An unset or empty OPENAI_API_KEY both count as "no key".
skip_llm = not os.environ.get("OPENAI_API_KEY")
try:
    import openai
except ModuleNotFoundError:
    # The client library is missing, so skip regardless of the key.
    skip_llm = True

# Substrings that must appear verbatim in the converted text of the LLM test.
LLM_TEST_STRINGS = [
    "5bda1dd6",
]
|
||
|
|
|
||
|
|
|
||
|
|
def test_markitdown_deprecation() -> None:
    """Check the deprecated ``mlm_*`` constructor keywords of ``MarkItDown``.

    Verifies that:

    * ``mlm_client`` and ``mlm_model`` each emit exactly one
      ``DeprecationWarning`` and are stored on ``_llm_client`` /
      ``_llm_model`` respectively;
    * passing a deprecated keyword together with its ``llm_*`` replacement
      raises ``ValueError``.
    """
    test_client = object()

    # pytest.warns records every warning raised in the block and restores the
    # previous filter state on exit, so no global resetwarnings() is needed
    # and filters configured by the runner are left untouched.
    with pytest.warns(DeprecationWarning) as record:
        markitdown = MarkItDown(mlm_client=test_client)
    assert len(record) == 1
    assert record[0].category is DeprecationWarning
    assert markitdown._llm_client == test_client

    with pytest.warns(DeprecationWarning) as record:
        markitdown = MarkItDown(mlm_model="gpt-4o")
    assert len(record) == 1
    assert record[0].category is DeprecationWarning
    assert markitdown._llm_model == "gpt-4o"

    # Supplying both the old and the new spelling is ambiguous and must fail.
    with pytest.raises(ValueError):
        MarkItDown(mlm_client=test_client, llm_client=test_client)

    with pytest.raises(ValueError):
        MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
    """Convert ``test_llm.jpg`` with an OpenAI-backed ``MarkItDown``.

    The converted text must contain every entry of ``LLM_TEST_STRINGS``
    verbatim and, case-insensitively, a handful of descriptive keywords.
    Skipped when ``skip_llm`` is set.
    """
    converter = MarkItDown(llm_client=openai.OpenAI(), llm_model="gpt-4o")

    image_path = os.path.join(TEST_FILES_DIR, "test_llm.jpg")
    text = converter.convert(image_path).text_content

    for expected in LLM_TEST_STRINGS:
        assert expected in text

    # This is not super precise. It would also accept "red square",
    # "blue circle", "the square is not blue", etc. But it's sufficient
    # for this test.
    lowered = text.lower()
    for keyword in ("red", "circle", "blue", "square"):
        assert keyword in lowered
|