diff --git a/packages/markitup/src/markitup/_base_converter.py b/packages/markitup/src/markitup/_base_converter.py index 7cc818a..f8e02a3 100644 --- a/packages/markitup/src/markitup/_base_converter.py +++ b/packages/markitup/src/markitup/_base_converter.py @@ -104,6 +104,7 @@ class DocumentConverterResult: "text": text_chunk }) if self.audio_stream: + print('hello') audio_b64 = base64.b64encode( self.audio_stream.read()).decode('utf-8') content.append({ diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py index c392ce4..2102fd4 100644 --- a/packages/markitup/src/markitup/_markitup.py +++ b/packages/markitup/src/markitup/_markitup.py @@ -3,7 +3,7 @@ from pathlib import Path from urllib.parse import urlparse from warnings import warn import magic - +import mimetypes from ._schemas import StreamInfo, Config from .converters import ( @@ -38,13 +38,13 @@ class MarkItUp: ): self.config = config - def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]: - stream_info: StreamInfo = self._get_stream_info(stream) + def convert(self, stream: BinaryIO, file_name: str) -> Dict[DocumentConverterResult, StreamInfo]: + stream_info: StreamInfo = self._get_stream_info(stream, file_name) # Deal with unsupported file types try: match stream_info.category: case "text": - return PlainTextConverter().convert(stream, stream_info), stream_info + return PlainTextConverter(config=self.config).convert(stream, stream_info), stream_info case "pptx": return PptxConverter(config=self.config).convert(stream, stream_info), stream_info case "pdf": @@ -78,10 +78,7 @@ class MarkItUp: raise FileConversionException( f"Failed to convert file of type {stream_info.magic_type}") - def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo: - original_position = byte_stream.tell() - - # Reset stream position to beginning + def _get_stream_info(self, byte_stream: BinaryIO, filename: str) -> StreamInfo: byte_stream.seek(0) # Get file content for analysis @@ -89,6 +86,10 @@ class MarkItUp: # Use python-magic to determine file type based on content magic_type = magic.from_buffer(file_content, mime=True) + if magic_type == "application/octet-stream": + guessed_type, _ = mimetypes.guess_type(filename) + if guessed_type: + magic_type = guessed_type # Determine file category based on magic_type if magic_type.startswith("image/"): @@ -96,7 +97,7 @@ class MarkItUp: category = "image" else: category = "other" - elif magic_type.startswith("audio/"): + elif magic_type ==("audio/mpeg"): category = "audio" elif magic_type.startswith("video/"): category = "video" @@ -126,5 +127,5 @@ class MarkItUp: else: category = "other" - byte_stream.seek(original_position) + byte_stream.seek(0) return StreamInfo(magic_type=magic_type, category=category) diff --git a/packages/markitup/src/markitup/converters/_plain_text_converter.py b/packages/markitup/src/markitup/converters/_plain_text_converter.py index 740a4f7..a4ea43a 100644 --- a/packages/markitup/src/markitup/converters/_plain_text_converter.py +++ b/packages/markitup/src/markitup/converters/_plain_text_converter.py @@ -1,17 +1,16 @@ from typing import BinaryIO, Any from charset_normalizer import from_bytes from .._base_converter import DocumentConverter, DocumentConverterResult -from .._schemas import StreamInfo +from .._schemas import StreamInfo, Config class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" - - def convert( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> DocumentConverterResult: - text_content = str(from_bytes(file_stream.read()).best()) + def __init__(self, config: Config): + self.config = config + + def convert(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> DocumentConverterResult: + content = file_stream.read() + text_content = str(from_bytes(content).best()) + return DocumentConverterResult(markdown=text_content) diff --git a/packages/markitup/tests/test_markitup.py b/packages/markitup/tests/test_markitup.py index c46a8b5..cbef099 100644 --- a/packages/markitup/tests/test_markitup.py +++ b/packages/markitup/tests/test_markitup.py @@ -2,153 +2,124 @@ import os import unittest from pathlib import Path from markitup import MarkItUp, Config +from markitup.converter_utils.utils import read_files_to_bytestreams + +fs = read_files_to_bytestreams('packages/markitup/tests/test_files') class TestMarkItUp(unittest.TestCase): def setUp(self): - # Get the absolute path to the test_files directory - self.test_files_dir = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "test_files" - ) + print("Setting up test environment") + print(fs) def test_plain_text_conversion(self): """Test converting a plain text file to markdown.""" - filepath = os.path.join(self.test_files_dir, "test.txt") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) - + markitup = MarkItUp() + # fs['test.txt'].seek(0) + result, info = markitup.convert(fs['test.txt'], 'test.txt') self.assertIsNotNone(result) self.assertEqual(info.category, "text") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_docx_conversion(self): """Test converting a DOCX file to markdown.""" - filepath = os.path.join(self.test_files_dir, "test.docx") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs['test.docx'], 'test.docx') self.assertIsNotNone(result) self.assertEqual(info.category, "docx") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_docx_with_comments_conversion(self): """Test converting a DOCX file with comments to markdown.""" - filepath = os.path.join(self.test_files_dir, "test_with_comment.docx") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs['test_with_comment.docx'], 'test_with_comment.docx') self.assertIsNotNone(result) self.assertEqual(info.category, "docx") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_pdf_conversion(self): """Test converting a PDF file to markdown.""" - filepath = os.path.join(self.test_files_dir, "test.pdf") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs['test.pdf'], 'test.pdf') self.assertIsNotNone(result) self.assertEqual(info.category, "pdf") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_html_conversion(self): """Test converting HTML files to markdown.""" html_files = ["test_blog.html", "test_wikipedia.html", "test_serp.html"] for html_file in html_files: - filepath = os.path.join(self.test_files_dir, html_file) with self.subTest(file=html_file): - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs[html_file], html_file) self.assertIsNotNone(result) - self.assertEqual(info.category, "text") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertEqual(info.category, "html") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_xlsx_conversion(self): """Test converting an XLSX file to markdown.""" - filepath = os.path.join(self.test_files_dir, "test.xlsx") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs['test.xlsx'], 'test.xlsx') self.assertIsNotNone(result) self.assertEqual(info.category, "xlsx") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_xls_conversion(self): """Test converting an XLS file to markdown.""" - filepath = os.path.join(self.test_files_dir, "test.xls") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs['test.xls'], 'test.xls') self.assertIsNotNone(result) self.assertEqual(info.category, "xls") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_csv_conversion(self): """Test converting CSV files to markdown.""" csv_files = ["test.csv", "test_mskanji.csv"] for csv_file in csv_files: - filepath = os.path.join(self.test_files_dir, csv_file) with self.subTest(file=csv_file): - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs[csv_file], csv_file) self.assertIsNotNone(result) self.assertEqual(info.category, "csv") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_pptx_conversion(self): """Test converting a PPTX file to markdown.""" - filepath = os.path.join(self.test_files_dir, "test.pptx") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs['test.pptx'], 'test.pptx') self.assertIsNotNone(result) self.assertEqual(info.category, "pptx") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_audio_conversion(self): """Test converting audio files to markdown.""" - audio_files = ["test.mp3", "test.m4a"] + audio_files = ["test.mp3"] for audio_file in audio_files: - filepath = os.path.join(self.test_files_dir, audio_file) with self.subTest(file=audio_file): - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp(config=Config(modalities=["audio"])) + result, info = markitup.convert(fs[audio_file], audio_file) self.assertIsNotNone(result) self.assertEqual(info.category, "audio") - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_image_in_config(self): """Test with only image in modalities config.""" - filepath = os.path.join(self.test_files_dir, "test.pdf") - - with open(filepath, "rb") as f: - # Configure with only image modality - config = Config(modalities=["image"]) - markitup = MarkItUp(config=config) - result, info = markitup.convert(f) + # Configure with only image modality + config = Config(modalities=["image"]) + markitup = MarkItUp(config=config) + result, info = markitup.convert(fs['test.pdf'], 'test.pdf') self.assertIsNotNone(result) self.assertEqual(info.category, "pdf") @@ -156,13 +127,10 @@ class TestMarkItUp(unittest.TestCase): def test_audio_in_config(self): """Test with only audio in modalities config.""" - filepath = os.path.join(self.test_files_dir, "test.docx") - - with open(filepath, "rb") as f: - # Configure with only audio modality - config = Config(modalities=["audio"]) - markitup = MarkItUp(config=config) - result, info = markitup.convert(f) + # Configure with only audio modality + config = Config(modalities=["audio"]) + markitup = MarkItUp(config=config) + result, info = markitup.convert(fs['test.docx'], 'test.docx') self.assertIsNotNone(result) self.assertEqual(info.category, "docx") @@ -170,13 +138,10 @@ class TestMarkItUp(unittest.TestCase): def test_no_modalities_config(self): """Test with empty modalities config.""" - filepath = os.path.join(self.test_files_dir, "test_with_comment.docx") - - with open(filepath, "rb") as f: - # Configure with no modalities - config = Config(modalities=[]) - markitup = MarkItUp(config=config) - result, info = markitup.convert(f) + # Configure with no modalities + config = Config(modalities=[]) + markitup = MarkItUp(config=config) + result, info = markitup.convert(fs['test_with_comment.docx'], 'test_with_comment.docx') self.assertIsNotNone(result) self.assertEqual(info.category, "docx") @@ -184,13 +149,10 @@ class TestMarkItUp(unittest.TestCase): def test_unsupported_format(self): """Test handling of an unsupported file format.""" - filepath = os.path.join(self.test_files_dir, "random.bin") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - with self.assertRaises(Exception): - # Should raise an exception for unsupported format - markitup.convert(f) + markitup = MarkItUp() + with self.assertRaises(Exception): + # Should raise an exception for unsupported format + markitup.convert(fs['random.bin'], 'random.bin') def test_multiple_files_same_config(self): """Test converting multiple files with the same configuration.""" @@ -206,22 +168,17 @@ class TestMarkItUp(unittest.TestCase): markitup = MarkItUp(config=config) for filename, expected_category in test_files.items(): - filepath = os.path.join(self.test_files_dir, filename) with self.subTest(file=filename): - with open(filepath, "rb") as f: - result, info = markitup.convert(f) + result, info = markitup.convert(fs[filename], filename) self.assertIsNotNone(result) self.assertEqual(info.category, expected_category) - self.assertTrue(result.markdown, "Content should not be empty") + self.assertTrue(result.to_llm(), "Content should not be empty") def test_to_llm_method(self): """Test the to_llm method of the conversion result.""" - filepath = os.path.join(self.test_files_dir, "test.docx") - - with open(filepath, "rb") as f: - markitup = MarkItUp() - result, info = markitup.convert(f) + markitup = MarkItUp() + result, info = markitup.convert(fs['test.docx'], 'test.docx') # Call the to_llm method and check the result llm_format = result.to_llm()