markitdown/tests/core/test_llm.py

import os
from pathlib import Path
from warnings import catch_warnings, resetwarnings

import pytest

from markitdown import MarkItDown

TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"

# Don't run the llm tests without a key and the client library
skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
try:
    import openai
except ModuleNotFoundError:
    skip_llm = True

LLM_TEST_STRINGS = [
    "5bda1dd6",
]


def test_markitdown_deprecation() -> None:
    try:
        with catch_warnings(record=True) as w:
            test_client = object()
            markitdown = MarkItDown(mlm_client=test_client)
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert markitdown._llm_client == test_client
    finally:
        resetwarnings()

    try:
        with catch_warnings(record=True) as w:
            markitdown = MarkItDown(mlm_model="gpt-4o")
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert markitdown._llm_model == "gpt-4o"
    finally:
        resetwarnings()

    try:
        test_client = object()
        markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
        assert False
    except ValueError:
        pass

    try:
        markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
        assert False
    except ValueError:
        pass


@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
    client = openai.OpenAI()
    markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")

    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))

    for test_string in LLM_TEST_STRINGS:
        assert test_string in result.text_content

    # This is not super precise. It would also accept "red square", "blue circle",
    # "the square is not blue", etc. But it's sufficient for this test.
    for test_string in ["red", "circle", "blue", "square"]:
        assert test_string in result.text_content.lower()
feat(tests): add comprehensive tests for MarkItDown functionality Add new test cases for MarkItDown to cover LLM, remote, and local file conversions. Implement tests for handling deprecation warnings, external URL queries, and EXIF data processing. Ensure tests are skipped when necessary environment conditions are not met, improving test reliability and maintainability. 2024-12-26 10:15:07 +00:00			`import os`
			`from pathlib import Path`
			`from warnings import catch_warnings, resetwarnings`

			`import pytest`

			`from markitdown import MarkItDown`

			`TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"`

			`# Don't run the llm tests without a key and the client library`
			`skip_llm = False if os.environ.get("OPENAI_API_KEY") else True`
			`try:`
			`import openai`
			`except ModuleNotFoundError:`
			`skip_llm = True`

			`LLM_TEST_STRINGS = [`
			`"5bda1dd6",`
			`]`


			`def test_markitdown_deprecation() -> None:`
			`try:`
			`with catch_warnings(record=True) as w:`
			`test_client = object()`
			`markitdown = MarkItDown(mlm_client=test_client)`
			`assert len(w) == 1`
			`assert w[0].category is DeprecationWarning`
			`assert markitdown._llm_client == test_client`
			`finally:`
			`resetwarnings()`

			`try:`
			`with catch_warnings(record=True) as w:`
			`markitdown = MarkItDown(mlm_model="gpt-4o")`
			`assert len(w) == 1`
			`assert w[0].category is DeprecationWarning`
			`assert markitdown._llm_model == "gpt-4o"`
			`finally:`
			`resetwarnings()`

			`try:`
			`test_client = object()`
			`markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)`
			`assert False`
			`except ValueError:`
			`pass`

			`try:`
			`markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")`
			`assert False`
			`except ValueError:`
			`pass`


			`@pytest.mark.skipif(`
			`skip_llm,`
			`reason="do not run llm tests without a key",`
			`)`
			`def test_markitdown_llm() -> None:`
			`client = openai.OpenAI()`
			`markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")`

			`result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))`

			`for test_string in LLM_TEST_STRINGS:`
			`assert test_string in result.text_content`

			`# This is not super precise. It would also accept "red square", "blue circle",`
			`# "the square is not blue", etc. But it's sufficient for this test.`
			`for test_string in ["red", "circle", "blue", "square"]:`
			`assert test_string in result.text_content.lower()`