diff --git a/packages/markitup/src/markitup/_base_converter.py b/packages/markitup/src/markitup/_base_converter.py index 0d5300d..7cc818a 100644 --- a/packages/markitup/src/markitup/_base_converter.py +++ b/packages/markitup/src/markitup/_base_converter.py @@ -2,9 +2,11 @@ import os import tempfile from warnings import warn from typing import Any, Union, BinaryIO, Optional, List, Dict -from ._schemas import StreamInfo +from ._schemas import StreamInfo, Config import re import base64 +from PIL import Image +from io import BytesIO class DocumentConverterResult: @@ -13,6 +15,7 @@ class DocumentConverterResult: def __init__( self, markdown: str = "", + config: Optional[Config] = None, *, title: Optional[str] = None, audio_stream: Optional[BinaryIO] = None, @@ -20,18 +23,22 @@ class DocumentConverterResult: ): """ Initialize the DocumentConverterResult. - + The only required parameter is the converted Markdown text. The title, and any other metadata that may be added in the future, are optional. - + Parameters: - markdown: The converted Markdown text. + - config: Optional configuration settings. - title: Optional title of the document. + - audio_stream: Optional audio data. + - stream_info: Optional stream information. 
""" self.markdown = markdown self.audio_stream = audio_stream self.title = title self.stream_info = stream_info + self.config = config def to_llm(self) -> List[Dict[str, Any]]: """ @@ -65,6 +72,18 @@ class DocumentConverterResult: # Extract image data alt_text, content_type, b64_data = match.groups() + if self.config is not None and self.config.image_use_webp: + # Decode base64 data + img_data = base64.b64decode(b64_data) + + # Check if it's already a WebP image + if "webp" not in content_type.lower(): + # Convert to WebP + webp_data = self._convert_image_to_webp(img_data) + # Replace with WebP data + b64_data = base64.b64encode(webp_data).decode('utf-8') + content_type = "image/webp" + # Add the image content.append({ "type": "image", @@ -94,6 +113,28 @@ class DocumentConverterResult: }) return content + def _convert_image_to_webp(self, image_data: bytes, quality: int = 80) -> bytes: + """ + Convert image data to WebP format. + + Parameters: + - image_data: The original image data as bytes. + - quality: The quality setting (0-100) for WebP conversion. + + Returns: + - WebP converted image data as bytes. + """ + img = Image.open(BytesIO(image_data)) + # Convert to RGB if image has alpha channel or is not in RGB mode + if img.mode not in ('RGB', 'L'): + img = img.convert('RGB') + + # Save as WebP to a BytesIO object + webp_buffer = BytesIO() + img.save(webp_buffer, format="WEBP", quality=quality) + webp_buffer.seek(0) + return webp_buffer.read() + @property def text_content(self) -> str: """Soft-deprecated alias for `markdown`. 
New code should migrate to using `markdown` or __str__.""" diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py index a0b2186..c392ce4 100644 --- a/packages/markitup/src/markitup/_markitup.py +++ b/packages/markitup/src/markitup/_markitup.py @@ -9,6 +9,7 @@ from ._schemas import StreamInfo, Config from .converters import ( PlainTextConverter, HtmlConverter, + ImageConverter, PdfConverter, DocxConverter, XlsxConverter, @@ -55,14 +56,21 @@ class MarkItUp: case "xls": return XlsConverter(config=self.config).convert(stream, stream_info), stream_info case "csv": - return CsvConverter().convert(stream, stream_info), stream_info + return CsvConverter(config=self.config).convert(stream, stream_info), stream_info case "docx": return DocxConverter(config=self.config).convert(stream, stream_info), stream_info + case "image": + return ImageConverter(config=self.config).convert(stream, stream_info), stream_info + case "html": + return HtmlConverter(config=self.config).convert(stream, stream_info), stream_info case _: match stream_info.category: case "ppt": raise UnsupportedFormatException( ".ppt files are not supported, try .pptx instead") + case "doc": + raise UnsupportedFormatException( + ".doc files are not supported, try .docx instead") case "other": raise UnsupportedFormatException( f"{stream_info.magic_type} files are not supported") @@ -84,7 +92,10 @@ class MarkItUp: # Determine file category based on magic_type if magic_type.startswith("image/"): - category = "image" + if magic_type in ["image/webp", "image/jpeg", "image/png", "image/jpg"]: + category = "image" + else: + category = "other" elif magic_type.startswith("audio/"): category = "audio" elif magic_type.startswith("video/"): @@ -108,6 +119,8 @@ class MarkItUp: elif magic_type.startswith("text/"): if magic_type == "text/csv": category = "csv" + elif magic_type == "text/html": + category = "html" else: category = "text" else: diff --git 
a/packages/markitup/src/markitup/_schemas.py b/packages/markitup/src/markitup/_schemas.py index 9cbe1c9..36adf09 100644 --- a/packages/markitup/src/markitup/_schemas.py +++ b/packages/markitup/src/markitup/_schemas.py @@ -13,3 +13,4 @@ class Config: modalities: List[Literal["image", "audio"]] = field( default_factory=lambda: ["image", "audio"] ) + image_use_webp: bool = True # TODO: support files that contain images diff --git a/packages/markitup/src/markitup/converters/__init__.py b/packages/markitup/src/markitup/converters/__init__.py index 5aea2af..bec2517 100644 --- a/packages/markitup/src/markitup/converters/__init__.py +++ b/packages/markitup/src/markitup/converters/__init__.py @@ -10,6 +10,7 @@ from ._xlsx_converter import XlsxConverter, XlsConverter from ._pptx_converter import PptxConverter from ._audio_converter import AudioConverter from ._csv_converter import CsvConverter +from ._image_converter import ImageConverter from ._markdownify import _CustomMarkdownify __all__ = [ @@ -19,6 +20,7 @@ __all__ = [ "_CustomMarkdownify", "WikipediaConverter", "YouTubeConverter", + "ImageConverter", "IpynbConverter", "BingSerpConverter", "PdfConverter", diff --git a/packages/markitup/src/markitup/converters/_audio_converter.py b/packages/markitup/src/markitup/converters/_audio_converter.py index eec84cd..19759ea 100644 --- a/packages/markitup/src/markitup/converters/_audio_converter.py +++ b/packages/markitup/src/markitup/converters/_audio_converter.py @@ -29,8 +29,8 @@ class AudioConverter(DocumentConverter): file_stream, magic_type=stream_info.magic_type) if transcript: md_content += "\n\n### Audio Transcript:\n" + transcript - return DocumentConverterResult(markdown=md_content.strip()) + return DocumentConverterResult(markdown=md_content.strip(), config=self.config) else: - return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info) + return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info, config=self.config) # Return the 
result diff --git a/packages/markitup/src/markitup/converters/_csv_converter.py b/packages/markitup/src/markitup/converters/_csv_converter.py index c68afe2..d7e6971 100644 --- a/packages/markitup/src/markitup/converters/_csv_converter.py +++ b/packages/markitup/src/markitup/converters/_csv_converter.py @@ -4,13 +4,15 @@ import io from typing import BinaryIO, Any from charset_normalizer import from_bytes from .._base_converter import DocumentConverter, DocumentConverterResult -from .._schemas import StreamInfo +from .._schemas import StreamInfo, Config class CsvConverter(DocumentConverter): """ Converts CSV files to Markdown tables. """ + def __init__(self, config: Config): + self.config = config def convert( self, @@ -48,4 +50,4 @@ class CsvConverter(DocumentConverter): result = "\n".join(markdown_table) - return DocumentConverterResult(markdown=result) + return DocumentConverterResult(markdown=result, config=self.config) diff --git a/packages/markitup/src/markitup/converters/_html_converter.py b/packages/markitup/src/markitup/converters/_html_converter.py index e41b34d..b7b1ac1 100644 --- a/packages/markitup/src/markitup/converters/_html_converter.py +++ b/packages/markitup/src/markitup/converters/_html_converter.py @@ -52,10 +52,11 @@ class HtmlConverter(DocumentConverter): # remove leading and trailing \n webpage_text = webpage_text.strip() - print(webpage_text) + return DocumentConverterResult( markdown=webpage_text, title=None if soup.title is None else soup.title.string, + config=self.config, ) def convert_string( diff --git a/packages/markitup/src/markitup/converters/_image_converter.py b/packages/markitup/src/markitup/converters/_image_converter.py new file mode 100644 index 0000000..2c99a78 --- /dev/null +++ b/packages/markitup/src/markitup/converters/_image_converter.py @@ -0,0 +1,48 @@ +from typing import BinaryIO, Any +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._schemas import StreamInfo, Config +import base64 + + 
+class ImageConverter(DocumentConverter): + """ + Converts image files to markdown with embedded base64 image. + """ + + def __init__(self, config: Config): + self.config = config + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Read the image data + image_bytes = file_stream.read() + + # Determine image extension from magic_type + image_ext = "png" # Default extension + match stream_info.magic_type: + case "image/jpeg" | "image/jpg": + image_ext = "jpeg" + case "image/png": + image_ext = "png" + case "image/webp": + image_ext = "webp" + + if 'image' in self.config.modalities: + img_base64 = base64.b64encode(image_bytes).decode('utf-8') + + # Create markdown with embedded image + markdown_content = f"![Image](data:image/{image_ext};base64,{img_base64})\n\n" + + return DocumentConverterResult( + markdown=markdown_content, + config=self.config, + ) + else: + return DocumentConverterResult( + markdown="No Image read as the supported modalities do not include 'image'", + config=self.config, + ) \ No newline at end of file diff --git a/packages/markitup/src/markitup/converters/_pdf_converter.py b/packages/markitup/src/markitup/converters/_pdf_converter.py index 8839eff..dc1f817 100644 --- a/packages/markitup/src/markitup/converters/_pdf_converter.py +++ b/packages/markitup/src/markitup/converters/_pdf_converter.py @@ -63,4 +63,5 @@ class PdfConverter(DocumentConverter): doc.close() return DocumentConverterResult( markdown=markdown_content, + config=self.config, ) diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py index 31af3cb..65b1778 100644 --- a/packages/markitup/src/markitup/converters/_pptx_converter.py +++ b/packages/markitup/src/markitup/converters/_pptx_converter.py @@ -116,7 +116,7 @@ class PptxConverter(DocumentConverter): md_content += notes_frame.text 
md_content = md_content.strip() - return DocumentConverterResult(markdown=md_content.strip()) + return DocumentConverterResult(markdown=md_content.strip(), config=self.config) def _is_picture(self, shape): if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: diff --git a/packages/markitup/tests/test_files/test.png b/packages/markitup/tests/test_files/test.png new file mode 100644 index 0000000..55efaf8 Binary files /dev/null and b/packages/markitup/tests/test_files/test.png differ diff --git a/packages/markitup/tests/test_files/test.webp b/packages/markitup/tests/test_files/test.webp new file mode 100644 index 0000000..122741b Binary files /dev/null and b/packages/markitup/tests/test_files/test.webp differ diff --git a/packages/markitup/tests/test_files/test_llm.jpg b/packages/markitup/tests/test_files/test_llm.jpg deleted file mode 100644 index 1f358fe..0000000 Binary files a/packages/markitup/tests/test_files/test_llm.jpg and /dev/null differ