diff --git a/README.md b/README.md index 02c5d91..da28041 100644 --- a/README.md +++ b/README.md @@ -16,4 +16,12 @@ While markitdown is a useful tool, its returned content is too text-focused, whi - PDF files - Plain text files - Returns OpenAI compatible response, which can be used by most LLM clients -- Supports command line usage \ No newline at end of file +- Supports command line usage + +## Installation + +Install directly from GitHub: + +```bash +pip install git+https://github.com/pathintegral-institute/markitup.git +``` \ No newline at end of file diff --git a/packages/markitup/pyproject.toml b/packages/markitup/pyproject.toml index 5bcd9af..ac2b95b 100644 --- a/packages/markitup/pyproject.toml +++ b/packages/markitup/pyproject.toml @@ -37,6 +37,8 @@ dependencies = [ "pydub", "SpeechRecognition", "pymupdf>=1.25.5", + "openpyxl>=3.1.5", + "xlrd>=2.0.1", ] [tool.hatch.version] diff --git a/packages/markitup/tests/__init__.py b/packages/markitup/tests/__init__.py new file mode 100644 index 0000000..93a2d4b --- /dev/null +++ b/packages/markitup/tests/__init__.py @@ -0,0 +1 @@ +# Test package initialization \ No newline at end of file diff --git a/packages/markitup/tests/test_markitup.py b/packages/markitup/tests/test_markitup.py new file mode 100644 index 0000000..c46a8b5 --- /dev/null +++ b/packages/markitup/tests/test_markitup.py @@ -0,0 +1,237 @@ +import os +import unittest +from pathlib import Path +from markitup import MarkItUp, Config + + +class TestMarkItUp(unittest.TestCase): + def setUp(self): + # Get the absolute path to the test_files directory + self.test_files_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_files" + ) + + def test_plain_text_conversion(self): + """Test converting a plain text file to markdown.""" + filepath = os.path.join(self.test_files_dir, "test.txt") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "text") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_docx_conversion(self): + """Test converting a DOCX file to markdown.""" + filepath = os.path.join(self.test_files_dir, "test.docx") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "docx") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_docx_with_comments_conversion(self): + """Test converting a DOCX file with comments to markdown.""" + filepath = os.path.join(self.test_files_dir, "test_with_comment.docx") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "docx") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_pdf_conversion(self): + """Test converting a PDF file to markdown.""" + filepath = os.path.join(self.test_files_dir, "test.pdf") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "pdf") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_html_conversion(self): + """Test converting HTML files to markdown.""" + html_files = ["test_blog.html", "test_wikipedia.html", "test_serp.html"] + + for html_file in html_files: + filepath = os.path.join(self.test_files_dir, html_file) + with self.subTest(file=html_file): + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "text") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_xlsx_conversion(self): + """Test converting an XLSX file to markdown.""" + filepath = os.path.join(self.test_files_dir, "test.xlsx") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "xlsx") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_xls_conversion(self): + """Test converting an XLS file to markdown.""" + filepath = os.path.join(self.test_files_dir, "test.xls") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "xls") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_csv_conversion(self): + """Test converting CSV files to markdown.""" + csv_files = ["test.csv", "test_mskanji.csv"] + + for csv_file in csv_files: + filepath = os.path.join(self.test_files_dir, csv_file) + with self.subTest(file=csv_file): + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "csv") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_pptx_conversion(self): + """Test converting a PPTX file to markdown.""" + filepath = os.path.join(self.test_files_dir, "test.pptx") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "pptx") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_audio_conversion(self): + """Test converting audio files to markdown.""" + audio_files = ["test.mp3", "test.m4a"] + + for audio_file in audio_files: + filepath = os.path.join(self.test_files_dir, audio_file) + with self.subTest(file=audio_file): + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "audio") + self.assertTrue(result.markdown, "Content should not be empty") + + def test_image_in_config(self): + """Test with only image in modalities config.""" + filepath = os.path.join(self.test_files_dir, "test.pdf") + + with open(filepath, "rb") as f: + # Configure with only image modality + config = Config(modalities=["image"]) + markitup = MarkItUp(config=config) + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "pdf") + # PDF might still include image references if there are images in the PDF + + def test_audio_in_config(self): + """Test with only audio in modalities config.""" + filepath = os.path.join(self.test_files_dir, "test.docx") + + with open(filepath, "rb") as f: + # Configure with only audio modality + config = Config(modalities=["audio"]) + markitup = MarkItUp(config=config) + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "docx") + # Should not have image tags in the result + + def test_no_modalities_config(self): + """Test with empty modalities config.""" + filepath = os.path.join(self.test_files_dir, "test_with_comment.docx") + + with open(filepath, "rb") as f: + # Configure with no modalities + config = Config(modalities=[]) + markitup = MarkItUp(config=config) + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, "docx") + # Should have text without image or audio references + + def test_unsupported_format(self): + """Test handling of an unsupported file format.""" + filepath = os.path.join(self.test_files_dir, "random.bin") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + with self.assertRaises(Exception): + # Should raise an exception for unsupported format + markitup.convert(f) + + def test_multiple_files_same_config(self): + """Test converting multiple files with the same configuration.""" + test_files = { + "test.txt": "text", + "test.docx": "docx", + "test.pdf": "pdf", + "test.xlsx": "xlsx" + } + + # Create a single configuration to use for all conversions + config = Config(modalities=["image", "audio"]) + markitup = MarkItUp(config=config) + + for filename, expected_category in test_files.items(): + filepath = os.path.join(self.test_files_dir, filename) + with self.subTest(file=filename): + with open(filepath, "rb") as f: + result, info = markitup.convert(f) + + self.assertIsNotNone(result) + self.assertEqual(info.category, expected_category) + self.assertTrue(result.markdown, "Content should not be empty") + + def test_to_llm_method(self): + """Test the to_llm method of the conversion result.""" + filepath = os.path.join(self.test_files_dir, "test.docx") + + with open(filepath, "rb") as f: + markitup = MarkItUp() + result, info = markitup.convert(f) + + # Call the to_llm method and check the result + llm_format = result.to_llm() + self.assertIsNotNone(llm_format) + self.assertIsInstance(llm_format, list) + + # Check if there's at least one content element + if llm_format: + self.assertIn("type", llm_format[0]) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/packages/markitup/uv.lock b/packages/markitup/uv.lock index 5d9ff8d..56bd2e1 100644 --- a/packages/markitup/uv.lock +++ b/packages/markitup/uv.lock @@ -173,6 +173,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 }, +] + [[package]] name = "flatbuffers" version = "25.2.10" @@ -339,6 +348,7 @@ dependencies = [ { name = "mammoth" }, { name = "markdownify" }, { name = "olefile" }, + { name = "openpyxl" }, { name = "pandas" }, { name = "pydub" }, { name = "pymupdf" }, @@ -346,6 +356,7 @@ dependencies = [ { name = "python-pptx" }, { name = "requests" }, { name = "speechrecognition" }, + { name = "xlrd" }, ] [package.metadata] @@ -357,6 +368,7 @@ requires-dist = [ { name = "mammoth" }, { name = "markdownify" }, { name = "olefile" }, + { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas" }, { name = "pydub" }, { name = "pymupdf", specifier = ">=1.25.5" }, @@ -364,6 +376,7 @@ requires-dist = [ { name = "python-pptx" }, { name = "requests" }, { name = "speechrecognition" }, + { name = "xlrd", specifier = ">=2.0.1" }, ] [[package]] @@ -479,6 +492,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 }, +] + [[package]] name = "packaging" version = "25.0" @@ -822,6 +847,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 }, ] +[[package]] +name = "xlrd" +version = "2.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/b3/19a2540d21dea5f908304375bd43f5ed7a4c28a370dc9122c565423e6b44/xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88", size = 100259 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/0c/c2a72d51fe56e08a08acc85d13013558a2d793028ae7385448a6ccdfae64/xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd", size = 96531 }, +] + [[package]] name = "xlsxwriter" version = "3.2.3"