diff --git a/packages/markitup/src/markitup/_base_converter.py b/packages/markitup/src/markitup/_base_converter.py index 0d5300d..7cc818a 100644 --- a/packages/markitup/src/markitup/_base_converter.py +++ b/packages/markitup/src/markitup/_base_converter.py @@ -2,9 +2,11 @@ import os import tempfile from warnings import warn from typing import Any, Union, BinaryIO, Optional, List, Dict -from ._schemas import StreamInfo +from ._schemas import StreamInfo, Config import re import base64 +from PIL import Image +from io import BytesIO class DocumentConverterResult: @@ -13,6 +15,7 @@ class DocumentConverterResult: def __init__( self, markdown: str = "", + config: Optional[Config] = None, *, title: Optional[str] = None, audio_stream: Optional[BinaryIO] = None, @@ -20,18 +23,22 @@ class DocumentConverterResult: ): """ Initialize the DocumentConverterResult. - + The only required parameter is the converted Markdown text. The title, and any other metadata that may be added in the future, are optional. - + Parameters: - markdown: The converted Markdown text. + - config: Optional configuration settings. - title: Optional title of the document. + - audio_stream: Optional audio data. + - stream_info: Optional stream information. 
""" self.markdown = markdown self.audio_stream = audio_stream self.title = title self.stream_info = stream_info + self.config = config def to_llm(self) -> List[Dict[str, Any]]: """ @@ -65,6 +72,18 @@ class DocumentConverterResult: # Extract image data alt_text, content_type, b64_data = match.groups() + if self.config is not None and self.config.image_use_webp: + # Decode base64 data + img_data = base64.b64decode(b64_data) + + # Check if it's already a WebP image + if "webp" not in content_type.lower(): + # Convert to WebP + webp_data = self._convert_image_to_webp(img_data) + # Replace with WebP data + b64_data = base64.b64encode(webp_data).decode('utf-8') + content_type = "image/webp" + # Add the image content.append({ "type": "image", @@ -94,6 +113,28 @@ class DocumentConverterResult: }) return content + def _convert_image_to_webp(self, image_data: bytes, quality: int = 80) -> bytes: + """ + Convert image data to WebP format. + + Parameters: + - image_data: The original image data as bytes. + - quality: The quality setting (0-100) for WebP conversion. + + Returns: + - WebP converted image data as bytes. + """ + img = Image.open(BytesIO(image_data)) + # Convert to RGB unless the image is already RGB or greyscale ('L'); any alpha channel is dropped + if img.mode not in ('RGB', 'L'): + img = img.convert('RGB') + + # Save as WebP to a BytesIO object + webp_buffer = BytesIO() + img.save(webp_buffer, format="WEBP", quality=quality) + webp_buffer.seek(0) + return webp_buffer.read() + @property def text_content(self) -> str: """Soft-deprecated alias for `markdown`. 
New code should migrate to using `markdown` or __str__.""" diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py index 1cc8cad..5e585d0 100644 --- a/packages/markitup/src/markitup/_markitup.py +++ b/packages/markitup/src/markitup/_markitup.py @@ -56,7 +56,7 @@ class MarkItUp: case "xls": return XlsConverter(config=self.config).convert(stream, stream_info), stream_info case "csv": - return CsvConverter().convert(stream, stream_info), stream_info + return CsvConverter(config=self.config).convert(stream, stream_info), stream_info case "docx": return DocxConverter(config=self.config).convert(stream, stream_info), stream_info case "image": diff --git a/packages/markitup/src/markitup/converters/_audio_converter.py b/packages/markitup/src/markitup/converters/_audio_converter.py index eec84cd..19759ea 100644 --- a/packages/markitup/src/markitup/converters/_audio_converter.py +++ b/packages/markitup/src/markitup/converters/_audio_converter.py @@ -29,8 +29,8 @@ class AudioConverter(DocumentConverter): file_stream, magic_type=stream_info.magic_type) if transcript: md_content += "\n\n### Audio Transcript:\n" + transcript - return DocumentConverterResult(markdown=md_content.strip()) + return DocumentConverterResult(markdown=md_content.strip(), config=self.config) else: - return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info) + return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info, config=self.config) # Return the result diff --git a/packages/markitup/src/markitup/converters/_csv_converter.py b/packages/markitup/src/markitup/converters/_csv_converter.py index c68afe2..d7e6971 100644 --- a/packages/markitup/src/markitup/converters/_csv_converter.py +++ b/packages/markitup/src/markitup/converters/_csv_converter.py @@ -4,13 +4,15 @@ import io from typing import BinaryIO, Any from charset_normalizer import from_bytes from .._base_converter import DocumentConverter, 
DocumentConverterResult -from .._schemas import StreamInfo +from .._schemas import StreamInfo, Config class CsvConverter(DocumentConverter): """ Converts CSV files to Markdown tables. """ + def __init__(self, config: Config): + self.config = config def convert( self, @@ -48,4 +50,4 @@ class CsvConverter(DocumentConverter): result = "\n".join(markdown_table) - return DocumentConverterResult(markdown=result) + return DocumentConverterResult(markdown=result, config=self.config) diff --git a/packages/markitup/src/markitup/converters/_html_converter.py b/packages/markitup/src/markitup/converters/_html_converter.py index e41b34d..b7b1ac1 100644 --- a/packages/markitup/src/markitup/converters/_html_converter.py +++ b/packages/markitup/src/markitup/converters/_html_converter.py @@ -52,10 +52,11 @@ class HtmlConverter(DocumentConverter): # remove leading and trailing \n webpage_text = webpage_text.strip() - print(webpage_text) + return DocumentConverterResult( markdown=webpage_text, title=None if soup.title is None else soup.title.string, + config=self.config, ) def convert_string( diff --git a/packages/markitup/src/markitup/converters/_image_converter.py b/packages/markitup/src/markitup/converters/_image_converter.py index de05512..2c99a78 100644 --- a/packages/markitup/src/markitup/converters/_image_converter.py +++ b/packages/markitup/src/markitup/converters/_image_converter.py @@ -39,8 +39,10 @@ class ImageConverter(DocumentConverter): return DocumentConverterResult( markdown=markdown_content, + config=self.config, ) else: return DocumentConverterResult( markdown="No Image read as the supported modalities do not include 'image'", + config=self.config, ) \ No newline at end of file diff --git a/packages/markitup/src/markitup/converters/_pdf_converter.py b/packages/markitup/src/markitup/converters/_pdf_converter.py index 8839eff..dc1f817 100644 --- a/packages/markitup/src/markitup/converters/_pdf_converter.py +++ 
b/packages/markitup/src/markitup/converters/_pdf_converter.py @@ -63,4 +63,5 @@ class PdfConverter(DocumentConverter): doc.close() return DocumentConverterResult( markdown=markdown_content, + config=self.config, ) diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py index 31af3cb..65b1778 100644 --- a/packages/markitup/src/markitup/converters/_pptx_converter.py +++ b/packages/markitup/src/markitup/converters/_pptx_converter.py @@ -116,7 +116,7 @@ class PptxConverter(DocumentConverter): md_content += notes_frame.text md_content = md_content.strip() - return DocumentConverterResult(markdown=md_content.strip()) + return DocumentConverterResult(markdown=md_content.strip(), config=self.config) def _is_picture(self, shape): if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: