diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py
index a0b2186..1cc8cad 100644
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@@ -9,6 +9,7 @@ from ._schemas import StreamInfo, Config
 from .converters import (
     PlainTextConverter,
     HtmlConverter,
+    ImageConverter,
     PdfConverter,
     DocxConverter,
     XlsxConverter,
@@ -58,11 +59,16 @@ class MarkItUp:
                 return CsvConverter().convert(stream, stream_info), stream_info
             case "docx":
                 return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
+            case "image":
+                return ImageConverter(config=self.config).convert(stream, stream_info), stream_info
             case _:
                 match stream_info.category:
                     case "ppt":
                         raise UnsupportedFormatException(
                             ".ppt files are not supported, try .pptx instead")
+                    case "doc":
+                        raise UnsupportedFormatException(
+                            ".doc files are not supported, try .docx instead")
                     case "other":
                         raise UnsupportedFormatException(
                             f"{stream_info.magic_type} files are not supported")
@@ -84,7 +90,10 @@ class MarkItUp:
 
         # Determine file category based on magic_type
         if magic_type.startswith("image/"):
-            category = "image"
+            if magic_type in ["image/webp", "image/jpeg", "image/png", "image/jpg"]:
+                category = "image"
+            else:
+                category = "other"
         elif magic_type.startswith("audio/"):
             category = "audio"
         elif magic_type.startswith("video/"):
diff --git a/packages/markitup/src/markitup/_schemas.py b/packages/markitup/src/markitup/_schemas.py
index 9cbe1c9..36adf09 100644
--- a/packages/markitup/src/markitup/_schemas.py
+++ b/packages/markitup/src/markitup/_schemas.py
@@ -13,3 +13,4 @@ class Config:
     modalities: List[Literal["image", "audio"]] = field(
         default_factory=lambda: ["image", "audio"]
     )
+    image_use_webp: bool = True  # TODO: support files that contain images
diff --git a/packages/markitup/src/markitup/converters/__init__.py b/packages/markitup/src/markitup/converters/__init__.py
index 5aea2af..bec2517 100644
--- a/packages/markitup/src/markitup/converters/__init__.py
+++ b/packages/markitup/src/markitup/converters/__init__.py
@@ -10,6 +10,7 @@ from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._audio_converter import AudioConverter
 from ._csv_converter import CsvConverter
+from ._image_converter import ImageConverter
 from ._markdownify import _CustomMarkdownify
 
 __all__ = [
@@ -19,6 +20,7 @@ __all__ = [
     "_CustomMarkdownify",
     "WikipediaConverter",
     "YouTubeConverter",
+    "ImageConverter",
     "IpynbConverter",
     "BingSerpConverter",
     "PdfConverter",
diff --git a/packages/markitup/src/markitup/converters/_image_converter.py b/packages/markitup/src/markitup/converters/_image_converter.py
new file mode 100644
index 0000000..de05512
--- /dev/null
+++ b/packages/markitup/src/markitup/converters/_image_converter.py
@@ -0,0 +1,48 @@
+import base64
+from typing import Any, BinaryIO
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._schemas import Config, StreamInfo
+
+# Subtype used in the data URI for each MIME type this converter recognises.
+_DATA_URI_SUBTYPES = {
+    "image/jpeg": "jpeg",
+    "image/jpg": "jpeg",
+    "image/png": "png",
+    "image/webp": "webp",
+}
+_DEFAULT_SUBTYPE = "png"
+
+
+class ImageConverter(DocumentConverter):
+    """Convert an image file to Markdown that embeds it as a base64 data URI."""
+
+    def __init__(self, config: Config):
+        """Store the configuration whose ``modalities`` gate image output."""
+        self.config = config
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """Return Markdown embedding the image read from ``file_stream``.
+
+        The data URI subtype is chosen from ``stream_info.magic_type`` and
+        falls back to ``png`` for any type missing from
+        ``_DATA_URI_SUBTYPES``. When ``"image"`` is not among
+        ``self.config.modalities`` the stream is left unread and a
+        placeholder message is returned instead.
+        """
+        # Skip reading (and base64-encoding) the file when images are disabled.
+        if "image" not in self.config.modalities:
+            return DocumentConverterResult(
+                markdown="No Image read as the supported modalities do not include 'image'",
+            )
+
+        image_ext = _DATA_URI_SUBTYPES.get(stream_info.magic_type, _DEFAULT_SUBTYPE)
+        img_base64 = base64.b64encode(file_stream.read()).decode("utf-8")
+        return DocumentConverterResult(
+            markdown=f"![Image](data:image/{image_ext};base64,{img_base64})\n\n",
+        )
diff --git a/packages/markitup/tests/test_files/test.png b/packages/markitup/tests/test_files/test.png
new file mode 100644
index 0000000..55efaf8
Binary files /dev/null and b/packages/markitup/tests/test_files/test.png differ
diff --git a/packages/markitup/tests/test_files/test.webp b/packages/markitup/tests/test_files/test.webp
new file mode 100644
index 0000000..122741b
Binary files /dev/null and b/packages/markitup/tests/test_files/test.webp differ
diff --git a/packages/markitup/tests/test_files/test_llm.jpg b/packages/markitup/tests/test_files/test_llm.jpg
deleted file mode 100644
index 1f358fe..0000000
Binary files a/packages/markitup/tests/test_files/test_llm.jpg and /dev/null differ