add readme and test

This commit is contained in:
rong-xyz 2025-04-23 07:18:35 +00:00
parent 8527b09e3f
commit ce9cfff3bf
5 changed files with 283 additions and 1 deletions

View file

@ -16,4 +16,12 @@ While markitdown is a useful tool, its returned content is too text-focused, whi
- PDF files
- Plain text files
- Returns OpenAI compatible response, which can be used by most LLM clients
- Supports command line usage
- Supports command line usage
## Installation
Install directly from GitHub:
```bash
pip install git+https://github.com/pathintegral-institute/markitup.git
```

View file

@ -37,6 +37,8 @@ dependencies = [
"pydub",
"SpeechRecognition",
"pymupdf>=1.25.5",
"openpyxl>=3.1.5",
"xlrd>=2.0.1",
]
[tool.hatch.version]

View file

@ -0,0 +1 @@
# Test package initialization

View file

@ -0,0 +1,237 @@
import os
import unittest
from pathlib import Path
from markitup import MarkItUp, Config
class TestMarkItUp(unittest.TestCase):
    """Conversion tests for ``MarkItUp`` driven by the ``test_files`` fixtures.

    Every test opens a fixture in binary mode and hands the stream to
    ``MarkItUp.convert``, which is unpacked as a ``(result, info)`` pair.
    The tests check ``info.category`` and, where relevant, that
    ``result.markdown`` is truthy.
    """

    def setUp(self):
        """Resolve the absolute path of the ``test_files`` fixture directory."""
        # The fixtures live in a directory next to this test module.
        self.test_files_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "test_files"
        )

    def _assert_conversion(
        self,
        filename,
        expected_category,
        *,
        converter=None,
        require_markdown=True,
    ):
        """Convert one fixture and run the assertions shared by most tests.

        Args:
            filename: Name of a file inside ``self.test_files_dir``.
            expected_category: Value that ``info.category`` must equal.
            converter: ``MarkItUp`` instance to use; a default-configured
                one is created when omitted.
            require_markdown: When true, also assert that ``result.markdown``
                is truthy.

        Returns:
            The ``result`` object returned by ``convert``.
        """
        filepath = os.path.join(self.test_files_dir, filename)
        with open(filepath, "rb") as f:
            if converter is None:
                converter = MarkItUp()
            result, info = converter.convert(f)
            # Assertions run while the stream is still open, as in the
            # original hand-written tests.
            self.assertIsNotNone(result)
            self.assertEqual(info.category, expected_category)
            if require_markdown:
                self.assertTrue(result.markdown, "Content should not be empty")
        return result

    def test_plain_text_conversion(self):
        """Test converting a plain text file to markdown."""
        self._assert_conversion("test.txt", "text")

    def test_docx_conversion(self):
        """Test converting a DOCX file to markdown."""
        self._assert_conversion("test.docx", "docx")

    def test_docx_with_comments_conversion(self):
        """Test converting a DOCX file with comments to markdown."""
        self._assert_conversion("test_with_comment.docx", "docx")

    def test_pdf_conversion(self):
        """Test converting a PDF file to markdown."""
        self._assert_conversion("test.pdf", "pdf")

    def test_html_conversion(self):
        """Test converting HTML files to markdown."""
        html_files = ["test_blog.html", "test_wikipedia.html", "test_serp.html"]
        for html_file in html_files:
            # subTest reports each fixture separately instead of stopping
            # at the first failure.
            with self.subTest(file=html_file):
                self._assert_conversion(html_file, "text")

    def test_xlsx_conversion(self):
        """Test converting an XLSX file to markdown."""
        self._assert_conversion("test.xlsx", "xlsx")

    def test_xls_conversion(self):
        """Test converting an XLS file to markdown."""
        self._assert_conversion("test.xls", "xls")

    def test_csv_conversion(self):
        """Test converting CSV files to markdown."""
        csv_files = ["test.csv", "test_mskanji.csv"]
        for csv_file in csv_files:
            with self.subTest(file=csv_file):
                self._assert_conversion(csv_file, "csv")

    def test_pptx_conversion(self):
        """Test converting a PPTX file to markdown."""
        self._assert_conversion("test.pptx", "pptx")

    def test_audio_conversion(self):
        """Test converting audio files to markdown."""
        audio_files = ["test.mp3", "test.m4a"]
        for audio_file in audio_files:
            with self.subTest(file=audio_file):
                self._assert_conversion(audio_file, "audio")

    def test_image_in_config(self):
        """Test with only image in modalities config."""
        # Configure with only image modality
        converter = MarkItUp(config=Config(modalities=["image"]))
        # PDF might still include image references if there are images in
        # the PDF, so the markdown content itself is not asserted here.
        self._assert_conversion(
            "test.pdf", "pdf", converter=converter, require_markdown=False
        )

    def test_audio_in_config(self):
        """Test with only audio in modalities config."""
        # Configure with only audio modality
        converter = MarkItUp(config=Config(modalities=["audio"]))
        # TODO: the result should not have image tags; this is not asserted yet.
        self._assert_conversion(
            "test.docx", "docx", converter=converter, require_markdown=False
        )

    def test_no_modalities_config(self):
        """Test with empty modalities config."""
        # Configure with no modalities
        converter = MarkItUp(config=Config(modalities=[]))
        # TODO: the result should have text without image or audio
        # references; this is not asserted yet.
        self._assert_conversion(
            "test_with_comment.docx",
            "docx",
            converter=converter,
            require_markdown=False,
        )

    def test_unsupported_format(self):
        """Test handling of an unsupported file format."""
        filepath = os.path.join(self.test_files_dir, "random.bin")
        with open(filepath, "rb") as f:
            markitup = MarkItUp()
            # NOTE(review): Exception is very broad; narrow it once the
            # concrete exception type raised by convert() is confirmed.
            with self.assertRaises(Exception):
                # Should raise an exception for unsupported format
                markitup.convert(f)

    def test_multiple_files_same_config(self):
        """Test converting multiple files with the same configuration."""
        test_files = {
            "test.txt": "text",
            "test.docx": "docx",
            "test.pdf": "pdf",
            "test.xlsx": "xlsx",
        }
        # Create a single converter to reuse for all conversions
        config = Config(modalities=["image", "audio"])
        markitup = MarkItUp(config=config)
        for filename, expected_category in test_files.items():
            with self.subTest(file=filename):
                self._assert_conversion(
                    filename, expected_category, converter=markitup
                )

    def test_to_llm_method(self):
        """Test the to_llm method of the conversion result."""
        filepath = os.path.join(self.test_files_dir, "test.docx")
        with open(filepath, "rb") as f:
            markitup = MarkItUp()
            result, info = markitup.convert(f)
            # Call the to_llm method while the stream is still open and
            # check the result
            llm_format = result.to_llm()
            self.assertIsNotNone(llm_format)
            self.assertIsInstance(llm_format, list)
            # Only inspect the first element when the list is non-empty;
            # an empty list is accepted by this test.
            if llm_format:
                self.assertIn("type", llm_format[0])
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -173,6 +173,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 },
]
[[package]]
name = "flatbuffers"
version = "25.2.10"
@ -339,6 +348,7 @@ dependencies = [
{ name = "mammoth" },
{ name = "markdownify" },
{ name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pydub" },
{ name = "pymupdf" },
@ -346,6 +356,7 @@ dependencies = [
{ name = "python-pptx" },
{ name = "requests" },
{ name = "speechrecognition" },
{ name = "xlrd" },
]
[package.metadata]
@ -357,6 +368,7 @@ requires-dist = [
{ name = "mammoth" },
{ name = "markdownify" },
{ name = "olefile" },
{ name = "openpyxl", specifier = ">=3.1.5" },
{ name = "pandas" },
{ name = "pydub" },
{ name = "pymupdf", specifier = ">=1.25.5" },
@ -364,6 +376,7 @@ requires-dist = [
{ name = "python-pptx" },
{ name = "requests" },
{ name = "speechrecognition" },
{ name = "xlrd", specifier = ">=2.0.1" },
]
[[package]]
@ -479,6 +492,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 },
]
[[package]]
name = "packaging"
version = "25.0"
@ -822,6 +847,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 },
]
[[package]]
name = "xlrd"
version = "2.0.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a6/b3/19a2540d21dea5f908304375bd43f5ed7a4c28a370dc9122c565423e6b44/xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88", size = 100259 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a6/0c/c2a72d51fe56e08a08acc85d13013558a2d793028ae7385448a6ccdfae64/xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd", size = 96531 },
]
[[package]]
name = "xlsxwriter"
version = "3.2.3"