# File viewer metadata (not part of the module): 237 lines, no EOL, 9.2 KiB, Python.
import os
|
|
import unittest
|
|
from pathlib import Path
|
|
from markitup import MarkItUp, Config
|
|
|
|
|
|
class TestMarkItUp(unittest.TestCase):
    """Smoke tests for ``MarkItUp.convert`` against the files in ``test_files``.

    Each test opens a fixture in binary mode, converts it, and checks the
    reported ``info.category`` and, where applicable, that ``result.markdown``
    is non-empty.
    """

    def setUp(self):
        """Compute the absolute path of the ``test_files`` directory beside this module."""
        self.test_files_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "test_files"
        )

    def _convert_and_check(
        self, filename, expected_category, converter=None, check_markdown=True
    ):
        """Convert one fixture and run the assertions shared by most tests.

        Args:
            filename: Name of a file inside ``self.test_files_dir``.
            expected_category: Value ``info.category`` must equal.
            converter: ``MarkItUp`` instance to use; a default-configured one
                is created when omitted.
            check_markdown: When true, also require ``result.markdown`` to be
                truthy.

        Returns:
            The ``result`` object returned by ``convert``.
        """
        filepath = os.path.join(self.test_files_dir, filename)

        with open(filepath, "rb") as f:
            if converter is None:
                converter = MarkItUp()
            result, info = converter.convert(f)

            # Assertions run while the file is still open so they hold whether
            # or not the result depends on the underlying stream.
            self.assertIsNotNone(result)
            self.assertEqual(info.category, expected_category)
            if check_markdown:
                self.assertTrue(result.markdown, "Content should not be empty")

        return result

    def test_plain_text_conversion(self):
        """Test converting a plain text file to markdown."""
        self._convert_and_check("test.txt", "text")

    def test_docx_conversion(self):
        """Test converting a DOCX file to markdown."""
        self._convert_and_check("test.docx", "docx")

    def test_docx_with_comments_conversion(self):
        """Test converting a DOCX file with comments to markdown."""
        self._convert_and_check("test_with_comment.docx", "docx")

    def test_pdf_conversion(self):
        """Test converting a PDF file to markdown."""
        self._convert_and_check("test.pdf", "pdf")

    def test_html_conversion(self):
        """Test converting HTML files to markdown."""
        html_files = ["test_blog.html", "test_wikipedia.html", "test_serp.html"]

        for html_file in html_files:
            with self.subTest(file=html_file):
                # The HTML fixtures are expected to report the "text" category.
                self._convert_and_check(html_file, "text")

    def test_xlsx_conversion(self):
        """Test converting an XLSX file to markdown."""
        self._convert_and_check("test.xlsx", "xlsx")

    def test_xls_conversion(self):
        """Test converting an XLS file to markdown."""
        self._convert_and_check("test.xls", "xls")

    def test_csv_conversion(self):
        """Test converting CSV files to markdown."""
        csv_files = ["test.csv", "test_mskanji.csv"]

        for csv_file in csv_files:
            with self.subTest(file=csv_file):
                self._convert_and_check(csv_file, "csv")

    def test_pptx_conversion(self):
        """Test converting a PPTX file to markdown."""
        self._convert_and_check("test.pptx", "pptx")

    def test_audio_conversion(self):
        """Test converting audio files to markdown."""
        audio_files = ["test.mp3", "test.m4a"]

        for audio_file in audio_files:
            with self.subTest(file=audio_file):
                self._convert_and_check(audio_file, "audio")

    def test_image_in_config(self):
        """Test with only image in modalities config."""
        # Configure with only image modality
        converter = MarkItUp(config=Config(modalities=["image"]))

        # PDF might still include image references if there are images in the
        # PDF, so only the result object and the category are checked.
        self._convert_and_check(
            "test.pdf", "pdf", converter=converter, check_markdown=False
        )

    def test_audio_in_config(self):
        """Test with only audio in modalities config."""
        # Configure with only audio modality
        converter = MarkItUp(config=Config(modalities=["audio"]))

        # Should not have image tags in the result.
        # TODO: nothing asserts this yet; only result and category are checked.
        self._convert_and_check(
            "test.docx", "docx", converter=converter, check_markdown=False
        )

    def test_no_modalities_config(self):
        """Test with empty modalities config."""
        # Configure with no modalities
        converter = MarkItUp(config=Config(modalities=[]))

        # Should have text without image or audio references.
        # TODO: nothing asserts this yet; only result and category are checked.
        self._convert_and_check(
            "test_with_comment.docx", "docx", converter=converter, check_markdown=False
        )

    def test_unsupported_format(self):
        """Test handling of an unsupported file format."""
        filepath = os.path.join(self.test_files_dir, "random.bin")

        # The file is opened outside ``assertRaises`` so that a missing fixture
        # surfaces as a test error instead of satisfying the expectation.
        with open(filepath, "rb") as f:
            markitup = MarkItUp()
            with self.assertRaises(Exception):
                # Should raise an exception for unsupported format
                markitup.convert(f)

    def test_multiple_files_same_config(self):
        """Test converting multiple files with the same configuration."""
        test_files = {
            "test.txt": "text",
            "test.docx": "docx",
            "test.pdf": "pdf",
            "test.xlsx": "xlsx",
        }

        # Create a single configuration to use for all conversions
        config = Config(modalities=["image", "audio"])
        markitup = MarkItUp(config=config)

        for filename, expected_category in test_files.items():
            with self.subTest(file=filename):
                self._convert_and_check(
                    filename, expected_category, converter=markitup
                )

    def test_to_llm_method(self):
        """Test the to_llm method of the conversion result."""
        filepath = os.path.join(self.test_files_dir, "test.docx")

        with open(filepath, "rb") as f:
            markitup = MarkItUp()
            result, _ = markitup.convert(f)

            # Call the to_llm method and check the result
            llm_format = result.to_llm()

        self.assertIsNotNone(llm_format)
        self.assertIsInstance(llm_format, list)

        # An empty list is accepted; the first element is inspected only when
        # at least one content element exists.
        if llm_format:
            self.assertIn("type", llm_format[0])
|
|
|
|
|
|
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()