2024-11-13 21:00:01 +00:00
#!/usr/bin/env python3 -m pytest
import io
import os
2025-03-28 22:36:38 +00:00
import re
2024-11-13 21:00:01 +00:00
import shutil
2025-03-06 05:16:55 +00:00
import openai
2024-11-13 21:00:01 +00:00
import pytest
2025-03-25 04:43:04 +00:00
from markitdown . _uri_utils import parse_data_uri , file_uri_to_path
2025-03-06 05:16:55 +00:00
from markitdown import (
MarkItDown ,
UnsupportedFormatException ,
FileConversionException ,
StreamInfo ,
)
2024-11-13 21:00:01 +00:00
2025-03-12 18:08:06 +00:00
# This file contains module tests that are not directly tested by the FileTestVectors.
# This includes things like helper functions and runtime conversion options
# (e.g., LLM clients, exiftool path, transcription services, etc.)
2024-11-13 22:37:47 +00:00
# Tests that query external URLs are not run in CI (GitHub Actions).
skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))

# LLM tests need both an API key and the openai client library.
# NOTE(review): openai is also imported unconditionally near the top of this
# file; when the package is missing that import fails before this guard runs.
skip_llm = not os.environ.get("OPENAI_API_KEY")
try:
    import openai
except ModuleNotFoundError:
    skip_llm = True

# exiftool tests need the exiftool executable to be discoverable on PATH.
skip_exiftool = shutil.which("exiftool") is None
# Fixture directory, resolved relative to this test module.
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")

# Metadata fields checked in the exiftool-based conversion of test.jpg.
JPG_TEST_EXIFTOOL = {
    "Author": "AutoGen Authors",
    "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "Description": "AutoGen enables diverse LLM-based applications",
    "ImageSize": "1615x1967",
    "DateTimeOriginal": "2024:03:14 22:10:00",
}

# Metadata fields checked in the exiftool-based conversion of test.mp3.
MP3_TEST_EXIFTOOL = {
    "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
    "Artist": "Artist Name Test String",
    "Album": "Album Name Test String",
    "SampleRate": "48000",
}
2024-11-13 21:00:01 +00:00
# Remote PDF and the text expected in its conversion.
PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
PDF_TEST_STRINGS = [
    "While there is contemporaneous exploration of multi-agent approaches",
]

# Remote YouTube video and the text expected in its conversion.
YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg"
YOUTUBE_TEST_STRINGS = [
    "## AutoGen FULL Tutorial with Python (Step-By-Step)",
    "This is an intermediate tutorial for installing and using AutoGen locally",
    "PT15M4S",
    # From the transcript
    "the model we're going to be using today is GPT 3.5 turbo",
]
2024-12-15 14:59:21 +00:00
# Text expected in the conversion of test_with_comment.docx; the last two
# entries are the bodies of comments attached to the document.
DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
]
2024-11-13 21:00:01 +00:00
# Blog post URL and excerpts expected in its conversion.
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_TEST_STRINGS = [
    "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
    "an example where high cost can easily prevent a generic complex",
]

# Identifier expected in LLM-assisted conversions (see test_markitdown_llm).
LLM_TEST_STRINGS = [
    "5bda1dd6",
]
2025-03-12 18:08:06 +00:00
# Text expected in the conversion of test.pptx.
PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    # chart title
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",
    # chart value
    "2003",
]
2024-11-13 21:00:01 +00:00
2024-12-20 19:42:32 +00:00
# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
    """Assert which strings a conversion result does and does not contain.

    Backslashes are stripped from ``result.text_content`` before matching.

    Args:
        result: Object exposing the converted text as ``text_content``.
        expected_strings: Strings that must each occur in the text.
        exclude_strings: Optional strings that must not occur in the text.

    Raises:
        AssertionError: If an expected string is missing or an excluded
            string is present.
    """
    haystack = result.text_content.replace("\\", "")
    for needle in expected_strings:
        assert needle in haystack
    # A missing or empty exclusion list means there is nothing to rule out.
    for needle in exclude_strings or ():
        assert needle not in haystack
2025-03-06 05:16:55 +00:00
def test_stream_info_operations() -> None:
    """Exercise StreamInfo.copy_and_update with keywords, objects, and both."""
    field_names = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
    base_info = StreamInfo(
        mimetype="mimetype.1",
        extension="extension.1",
        charset="charset.1",
        filename="filename.1",
        local_path="local_path.1",
        url="url.1",
    )

    # A single field can be overridden either by keyword argument or by
    # passing a StreamInfo that carries only that field; check both, in turn.
    update_styles = (
        lambda name, value: base_info.copy_and_update(**{name: value}),
        lambda name, value: base_info.copy_and_update(StreamInfo(**{name: value})),
    )
    for apply_update in update_styles:
        for name in field_names:
            replacement = f"{name}.2"
            updated = apply_update(name, replacement)
            # The targeted attribute is updated...
            assert getattr(updated, name) == replacement
            # ...and every other attribute still matches the original.
            for other in field_names:
                if other == name:
                    continue
                assert getattr(base_info, other) == getattr(updated, other)

    # Mixing a StreamInfo object with keyword overrides
    mixed = base_info.copy_and_update(
        StreamInfo(extension="extension.2", filename="filename.2"),
        mimetype="mimetype.3",
        charset="charset.3",
    )
    expected_mixed = {
        "extension": "extension.2",
        "filename": "filename.2",
        "mimetype": "mimetype.3",
        "charset": "charset.3",
        "local_path": "local_path.1",
        "url": "url.1",
    }
    for name, value in expected_mixed.items():
        assert getattr(mixed, name) == value

    # Several StreamInfo objects in one call
    combined = base_info.copy_and_update(
        StreamInfo(extension="extension.4", filename="filename.5"),
        StreamInfo(mimetype="mimetype.6", charset="charset.7"),
    )
    expected_combined = {
        "extension": "extension.4",
        "filename": "filename.5",
        "mimetype": "mimetype.6",
        "charset": "charset.7",
        "local_path": "local_path.1",
        "url": "url.1",
    }
    for name, value in expected_combined.items():
        assert getattr(combined, name) == value
2025-03-25 04:43:04 +00:00
def test_data_uris() -> None:
    """Check parse_data_uri on base64 and percent-encoded data URIs."""
    # Each case: (data URI, expected MIME type, expected attributes).
    # Every payload decodes to the same bytes.
    cases = [
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", "text/plain", {}),
        ("data:base64,SGVsbG8sIFdvcmxkIQ==", None, {}),
        (
            "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==",
            "text/plain",
            {"charset": "utf-8"},
        ),
        ("data:,Hello%2C%20World%21", None, {}),
        ("data:text/plain,Hello%2C%20World%21", "text/plain", {}),
        (
            "data:text/plain;charset=utf-8,Hello%2C%20World%21",
            "text/plain",
            {"charset": "utf-8"},
        ),
    ]
    for data_uri, expected_mime, expected_attributes in cases:
        mime_type, attributes, data = parse_data_uri(data_uri)
        if expected_mime is None:
            assert mime_type is None
        else:
            assert mime_type == expected_mime
        assert len(attributes) == len(expected_attributes)
        for attr_name, attr_value in expected_attributes.items():
            assert attributes[attr_name] == attr_value
        assert data == b"Hello, World!"
def test_file_uris() -> None:
    """Check file_uri_to_path on file URIs with various host/query/fragment forms."""
    # Each case: (file URI, expected netloc). The path is the same throughout.
    cases = [
        ("file:///path/to/file.txt", None),  # empty host
        ("file:/path/to/file.txt", None),  # no host
        ("file://localhost/path/to/file.txt", "localhost"),  # localhost
        ("file:///path/to/file.txt?param=value", None),  # query parameters
        ("file:///path/to/file.txt#fragment", None),  # fragment
    ]
    for file_uri, expected_netloc in cases:
        netloc, path = file_uri_to_path(file_uri)
        if expected_netloc is None:
            assert netloc is None
        else:
            assert netloc == expected_netloc
        assert path == "/path/to/file.txt"
2025-03-12 18:08:06 +00:00
def test_docx_comments() -> None:
    """Convert a DOCX containing comments, with style_map set on init.

    Only the converter configured with the style map is needed, so no
    default MarkItDown instance is created here.
    """
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    result = markitdown_with_style_map.convert(
        os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
    )
    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
2024-12-15 14:59:21 +00:00
2024-11-13 21:00:01 +00:00
2025-03-28 22:36:38 +00:00
def test_docx_equations() -> None:
    """Check that inline and block equations appear in the converted DOCX text."""
    converter = MarkItDown()
    equations_path = os.path.join(TEST_FILES_DIR, "equations.docx")
    text = converter.convert(equations_path).text_content

    # The inline equation m=1 is wrapped in single dollar signs.
    assert "$m=1$" in text, "Inline equation $m=1$ not found"

    # Block equations are wrapped in double dollar signs; at least one must exist.
    found_blocks = re.findall(r"\$\$(.+?)\$\$", text)
    assert found_blocks, "No block equations found in the document."
2025-03-12 18:08:06 +00:00
def test_input_as_strings() -> None:
    """Convert HTML supplied as an in-memory byte stream."""
    converter = MarkItDown()
    payloads = (
        # Input from a stream
        b"<html><body><h1>Test</h1></body></html>",
        # Input with leading blank characters
        b"\n\n\n<html><body><h1>Test</h1></body></html>",
    )
    for payload in payloads:
        converted = converter.convert_stream(io.BytesIO(payload))
        assert "# Test" in converted.text_content
2024-11-13 21:00:01 +00:00
2025-03-12 18:08:06 +00:00
@pytest.mark.skipif(
    skip_remote,
    reason="do not run tests that query external urls",
)
def test_markitdown_remote() -> None:
    """Convert remote resources by URL and check for the expected text."""
    converter = MarkItDown()
    remote_cases = (
        (PDF_TEST_URL, PDF_TEST_STRINGS),  # By URL
        (YOUTUBE_TEST_URL, YOUTUBE_TEST_STRINGS),  # Youtube
    )
    for url, expected_strings in remote_cases:
        converted = converter.convert(url)
        for expected in expected_strings:
            assert expected in converted.text_content
2025-03-06 05:16:55 +00:00
@pytest.mark.skipif(
    skip_remote,
    reason="do not run remotely run speech transcription tests",
)
def test_speech_transcription() -> None:
    """Transcribe WAV, MP3 and M4A files and look for the numbers one to five."""
    converter = MarkItDown()
    # Each number is accepted either as a digit or spelled out.
    spoken_numbers = (
        ("1", "one"),
        ("2", "two"),
        ("3", "three"),
        ("4", "four"),
        ("5", "five"),
    )
    for file_name in ["test.wav", "test.mp3", "test.m4a"]:
        audio_path = os.path.join(TEST_FILES_DIR, file_name)
        transcript = converter.convert(audio_path).text_content.lower()
        assert all(
            digit in transcript or word in transcript
            for digit, word in spoken_numbers
        )
2025-03-01 00:07:47 +00:00
def test_exceptions() -> None:
    """Check the exceptions raised for unsupported and corrupted inputs."""
    converter = MarkItDown()
    binary_path = os.path.join(TEST_FILES_DIR, "random.bin")

    # An unsupported format is rejected outright.
    with pytest.raises(UnsupportedFormatException):
        converter.convert(binary_path)

    # Declaring the same file to be a .pptx makes it look like a corrupted
    # presentation, so the conversion attempt itself fails.
    with pytest.raises(FileConversionException) as exc_info:
        converter.convert(binary_path, file_extension=".pptx")

    attempts = exc_info.value.attempts
    assert len(attempts) == 1
    assert type(attempts[0].converter).__name__ == "PptxConverter"
2024-11-13 21:00:01 +00:00
@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
    """Check exiftool metadata extraction with explicit and env-provided paths.

    The EXIFTOOL_PATH environment variable is set only for the duration of
    the second half of the test and restored afterwards, so the change does
    not leak into tests that run later in the same process.
    """
    which_exiftool = shutil.which("exiftool")
    assert which_exiftool is not None

    # Test explicitly setting the location of exiftool
    markitdown = MarkItDown(exiftool_path=which_exiftool)
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key, value in JPG_TEST_EXIFTOOL.items():
        assert f"{key}: {value}" in result.text_content

    # Test setting the exiftool path through an environment variable
    previous_path = os.environ.get("EXIFTOOL_PATH")
    os.environ["EXIFTOOL_PATH"] = which_exiftool
    try:
        markitdown = MarkItDown()
        result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
        for key, value in JPG_TEST_EXIFTOOL.items():
            assert f"{key}: {value}" in result.text_content

        # Test some other media types
        result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3"))
        for key, value in MP3_TEST_EXIFTOOL.items():
            assert f"{key}: {value}" in result.text_content
    finally:
        # Put the environment back exactly as it was found.
        if previous_path is None:
            os.environ.pop("EXIFTOOL_PATH", None)
        else:
            os.environ["EXIFTOOL_PATH"] = previous_path
2024-11-13 21:00:01 +00:00
2024-12-17 20:13:19 +00:00
@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
    """Check LLM-assisted conversion of a standalone image and of a PPTX."""
    llm_converter = MarkItDown(llm_client=openai.OpenAI(), llm_model="gpt-4o")

    image_result = llm_converter.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
    for expected in LLM_TEST_STRINGS:
        assert expected in image_result.text_content

    # This is not super precise. It would also accept "red square", "blue circle",
    # "the square is not blue", etc. But it's sufficient for this test.
    for word in ["red", "circle", "blue", "square"]:
        assert word in image_result.text_content.lower()

    # Images embedded in PPTX files
    pptx_result = llm_converter.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))

    # LLM captions are included
    for expected in LLM_TEST_STRINGS:
        assert expected in pptx_result.text_content

    # Standard alt text is included
    validate_strings(pptx_result, PPTX_TEST_STRINGS)
2024-12-17 20:13:19 +00:00
2024-11-13 21:00:01 +00:00
if __name__ == "__main__":
    # Runs this file's tests from the command line.
    for test in [
        test_stream_info_operations,
        test_data_uris,
        test_file_uris,
        test_docx_comments,
        test_docx_equations,
        test_input_as_strings,
        test_markitdown_remote,
        test_speech_transcription,
        test_exceptions,
        test_markitdown_exiftool,
        test_markitdown_llm,
    ]:
        print(f"Running {test.__name__}...", end="")

        # Calling a test function directly bypasses pytest, so skipif marks
        # are not applied automatically. Honour them here so that remote,
        # exiftool and LLM tests are skipped under the same conditions as
        # they would be under pytest. Any truthy condition counts as "skip".
        skip_reason = next(
            (
                mark.kwargs.get("reason", "skip condition met")
                for mark in getattr(test, "pytestmark", [])
                if mark.name == "skipif" and mark.args and mark.args[0]
            ),
            None,
        )
        if skip_reason is not None:
            print(f"SKIPPED ({skip_reason})")
            continue

        test()
        print("OK")
    print("All tests passed!")