markitdown/packages/markitup/tests/test_markitup.py

194 lines
7.7 KiB
Python
Raw Normal View History

2025-04-23 07:18:35 +00:00
import os
import unittest
from pathlib import Path
from markitup import MarkItUp, Config
2025-04-23 08:55:36 +00:00
from markitup.converter_utils.utils import read_files_to_bytestreams
# Shared test inputs, indexed by fixture file name (e.g. fs["test.txt"]).
# The directory is resolved relative to this file rather than the current
# working directory, so the suite loads the same fixtures no matter where the
# test runner is launched from.
fs = read_files_to_bytestreams(
    str(Path(__file__).resolve().parent / "test_files")
)
2025-04-23 07:18:35 +00:00
class TestMarkItUp(unittest.TestCase):
    """Conversion tests that run ``MarkItUp.convert`` over the shared fixtures.

    Inputs come from the module-level ``fs`` object, indexed by fixture file
    name. Each test checks the ``category`` reported by ``convert`` and, for
    most formats, that ``result.to_llm()`` is non-empty.
    """

    _EMPTY_CONTENT_MSG = "Content should not be empty"

    @staticmethod
    def _fixture(filename):
        """Return the fixture stream for *filename*, rewound to its start.

        The objects in ``fs`` are created once at import time and reused by
        several tests (``test.docx`` and ``test.pdf`` each appear in more than
        one test). Whether ``convert`` restores the read position is not
        visible from this file, so rewind defensively to keep every test
        independent of execution order.

        Raises:
            KeyError: if *filename* is not among the loaded fixtures.
        """
        stream = fs[filename]
        rewind = getattr(stream, "seek", None)
        if callable(rewind):
            rewind(0)
        return stream

    def _assert_conversion(
        self, filename, expected_category, converter=None, expect_content=True
    ):
        """Convert one fixture and run the assertions shared by most tests.

        Args:
            filename: Fixture name; used both as the ``fs`` key and as the
                file name passed to ``convert``.
            expected_category: Value ``info.category`` must equal.
            converter: ``MarkItUp`` instance to use; a default-configured one
                is created when omitted.
            expect_content: When true, also require ``result.to_llm()`` to be
                truthy.

        Returns:
            The ``(result, info)`` pair returned by ``convert``.
        """
        if converter is None:
            converter = MarkItUp()
        result, info = converter.convert(self._fixture(filename), filename)
        self.assertIsNotNone(result)
        self.assertEqual(info.category, expected_category)
        if expect_content:
            self.assertTrue(result.to_llm(), self._EMPTY_CONTENT_MSG)
        return result, info

    def test_plain_text_conversion(self):
        """Test converting a plain text file to markdown."""
        self._assert_conversion("test.txt", "text")

    def test_docx_conversion(self):
        """Test converting a DOCX file to markdown."""
        self._assert_conversion("test.docx", "docx")

    def test_docx_with_comments_conversion(self):
        """Test converting a DOCX file with comments to markdown."""
        self._assert_conversion("test_with_comment.docx", "docx")

    def test_pdf_conversion(self):
        """Test converting a PDF file to markdown."""
        self._assert_conversion("test.pdf", "pdf")

    def test_html_conversion(self):
        """Test converting HTML files to markdown."""
        for html_file in ("test_blog.html", "test_wikipedia.html", "test_serp.html"):
            with self.subTest(file=html_file):
                self._assert_conversion(html_file, "html")

    def test_xlsx_conversion(self):
        """Test converting an XLSX file to markdown."""
        self._assert_conversion("test.xlsx", "xlsx")

    def test_xls_conversion(self):
        """Test converting an XLS file to markdown."""
        self._assert_conversion("test.xls", "xls")

    def test_csv_conversion(self):
        """Test converting CSV files to markdown."""
        for csv_file in ("test.csv", "test_mskanji.csv"):
            with self.subTest(file=csv_file):
                self._assert_conversion(csv_file, "csv")

    def test_pptx_conversion(self):
        """Test converting a PPTX file to markdown."""
        self._assert_conversion("test.pptx", "pptx")

    def test_audio_conversion(self):
        """Test converting audio files to markdown."""
        for audio_file in ("test.mp3",):
            with self.subTest(file=audio_file):
                # A fresh audio-only converter per file, as in the original.
                audio_only = MarkItUp(config=Config(modalities=["audio"]))
                self._assert_conversion(audio_file, "audio", converter=audio_only)

    def test_image_in_config(self):
        """Test with only image in modalities config."""
        image_only = MarkItUp(config=Config(modalities=["image"]))
        self._assert_conversion(
            "test.pdf", "pdf", converter=image_only, expect_content=False
        )
        # PDF might still include image references if there are images in the PDF

    def test_audio_in_config(self):
        """Test with only audio in modalities config."""
        audio_only = MarkItUp(config=Config(modalities=["audio"]))
        self._assert_conversion(
            "test.docx", "docx", converter=audio_only, expect_content=False
        )
        # TODO: intended check (not yet asserted): the result should not
        # contain image tags.

    def test_no_modalities_config(self):
        """Test with empty modalities config."""
        no_modalities = MarkItUp(config=Config(modalities=[]))
        self._assert_conversion(
            "test_with_comment.docx",
            "docx",
            converter=no_modalities,
            expect_content=False,
        )
        # TODO: intended check (not yet asserted): text only, without image or
        # audio references.

    def test_unsupported_format(self):
        """Test handling of an unsupported file format."""
        converter = MarkItUp()
        # Fetch the fixture outside the assertRaises block: a missing fixture
        # must surface as a KeyError error, not satisfy the assertion below
        # without convert() ever being called.
        stream = self._fixture("random.bin")
        # The concrete exception type is not visible from this file, so the
        # broad Exception check is kept.
        with self.assertRaises(Exception):
            converter.convert(stream, "random.bin")

    def test_multiple_files_same_config(self):
        """Test converting multiple files with the same configuration."""
        expected_categories = {
            "test.txt": "text",
            "test.docx": "docx",
            "test.pdf": "pdf",
            "test.xlsx": "xlsx",
        }
        # One converter instance is deliberately reused for every file.
        shared = MarkItUp(config=Config(modalities=["image", "audio"]))
        for filename, expected_category in expected_categories.items():
            with self.subTest(file=filename):
                self._assert_conversion(
                    filename, expected_category, converter=shared
                )

    def test_to_llm_method(self):
        """Test the to_llm method of the conversion result."""
        result, _info = MarkItUp().convert(self._fixture("test.docx"), "test.docx")
        llm_format = result.to_llm()
        self.assertIsNotNone(llm_format)
        self.assertIsInstance(llm_format, list)
        # Require at least one element so the structural check below cannot be
        # skipped silently on an empty list.
        self.assertTrue(llm_format, self._EMPTY_CONTENT_MSG)
        self.assertIn("type", llm_format[0])
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()