Merge pull request #3 from pathintegral-institute/rong/tech-141-image-and-html-support

Rong/tech 141 image and html support
2025-04-23 14:44:36 +08:00 · 2025-04-23 14:44:36 +08:00 · ff31c019df
commit ff31c019df
parent cd85971867 71e55ba93e
13 changed files with 120 additions and 11 deletions
--- a/packages/markitup/src/markitup/_base_converter.py
+++ b/packages/markitup/src/markitup/_base_converter.py
@ -2,9 +2,11 @@ import os
 import tempfile
 from warnings import warn
 from typing import Any, Union, BinaryIO, Optional, List, Dict
-from ._schemas import StreamInfo
+from ._schemas import StreamInfo, Config
 import re
 import base64
+from PIL import Image
+from io import BytesIO


 class DocumentConverterResult:
@ -13,6 +15,7 @@ class DocumentConverterResult:
    def __init__(
        self,
        markdown: str = "",
+        config: Optional[Config] = None,
        *,
        title: Optional[str] = None,
        audio_stream: Optional[BinaryIO] = None,
@ -20,18 +23,22 @@ class DocumentConverterResult:
    ):
        """
        Initialize the DocumentConverterResult.
-
+        
        The only required parameter is the converted Markdown text.
        The title, and any other metadata that may be added in the future, are optional.
-
+        
        Parameters:
        - markdown: The converted Markdown text.
+        - config: Optional configuration settings.
        - title: Optional title of the document.
+        - audio_stream: Optional audio data.
+        - stream_info: Optional stream information.
        """
        self.markdown = markdown
        self.audio_stream = audio_stream
        self.title = title
        self.stream_info = stream_info
+        self.config = config

    def to_llm(self) -> List[Dict[str, Any]]:
        """
@ -65,6 +72,18 @@ class DocumentConverterResult:
            # Extract image data
            alt_text, content_type, b64_data = match.groups()

+            # Re-encode non-WebP images as WebP when the config asks for it.
+            # `config` defaults to None in __init__, so guard before reading
+            # the flag instead of raising AttributeError.
+            use_webp = self.config is not None and self.config.image_use_webp
+            if use_webp and "webp" not in content_type.lower():
+                # Decode only when a conversion is actually going to happen.
+                img_data = base64.b64decode(b64_data)
+                webp_data = self._convert_image_to_webp(img_data)
+                # Swap in the WebP payload and its matching content type.
+                b64_data = base64.b64encode(webp_data).decode('utf-8')
+                content_type = "image/webp"
+
            # Add the image
            content.append({
                "type": "image",
@ -94,6 +113,28 @@ class DocumentConverterResult:
            })
        return content

+    def _convert_image_to_webp(self, image_data: bytes, quality: int = 80) -> bytes:
+        """
+        Re-encode raw image bytes as WebP.
+
+        Parameters:
+        - image_data: Encoded bytes of the source image; must be readable by `Image.open`.
+        - quality: Value forwarded as the `quality` option of the WebP save (default 80).
+
+        Returns:
+        - The bytes of the WebP-encoded image.
+        """
+        source = Image.open(BytesIO(image_data))
+        # Anything other than 'RGB'/'L' (so 'RGBA' and 'LA' too) is converted to 'RGB'.
+        # NOTE(review): this discards any alpha channel - confirm that is intended.
+        if source.mode not in ('RGB', 'L'):
+            source = source.convert('RGB')
+
+        # Encode into an in-memory buffer and hand back its full contents.
+        encoded = BytesIO()
+        source.save(encoded, format="WEBP", quality=quality)
+        return encoded.getvalue()
+
    @property
    def text_content(self) -> str:
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@ -9,6 +9,7 @@ from ._schemas import StreamInfo, Config
 from .converters import (
    PlainTextConverter,
    HtmlConverter,
+    ImageConverter,
    PdfConverter,
    DocxConverter,
    XlsxConverter,
@ -55,14 +56,21 @@ class MarkItUp:
                case "xls":
                    return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
                case "csv":
-                    return CsvConverter().convert(stream, stream_info), stream_info
+                    return CsvConverter(config=self.config).convert(stream, stream_info), stream_info
                case "docx":
                    return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
+                case "image":
+                    return ImageConverter(config=self.config).convert(stream, stream_info), stream_info
+                case "html":
+                    return HtmlConverter(config=self.config).convert(stream, stream_info), stream_info
                case _:
                    match stream_info.category:
                        case "ppt":
                            raise UnsupportedFormatException(
                                ".ppt files are not supported, try .pptx instead")
+                        case "doc":
+                            raise UnsupportedFormatException(
+                                ".doc files are not supported, try .docx instead")
                        case "other":
                            raise UnsupportedFormatException(
                                f"{stream_info.magic_type} files are not supported")
@ -84,7 +92,10 @@ class MarkItUp:

        # Determine file category based on magic_type
        if magic_type.startswith("image/"):
-            category = "image"
+            if magic_type in ["image/webp", "image/jpeg", "image/png", "image/jpg"]:
+                category = "image"
+            else:
+                category = "other"
        elif magic_type.startswith("audio/"):
            category = "audio"
        elif magic_type.startswith("video/"):
@ -108,6 +119,8 @@ class MarkItUp:
        elif magic_type.startswith("text/"):
            if magic_type == "text/csv":
                category = "csv"
+            elif magic_type == "text/html":
+                category = "html"
            else:
                category = "text"
        else:
--- a/packages/markitup/src/markitup/_schemas.py
+++ b/packages/markitup/src/markitup/_schemas.py
@ -13,3 +13,4 @@ class Config:
    modalities: List[Literal["image", "audio"]] = field(
        default_factory=lambda: ["image", "audio"]
    )
+    image_use_webp: bool = True  # TODO: support files that contain images
--- a/packages/markitup/src/markitup/converters/init.py
+++ b/packages/markitup/src/markitup/converters/init.py
@ -10,6 +10,7 @@ from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._audio_converter import AudioConverter
 from ._csv_converter import CsvConverter
+from ._image_converter import ImageConverter
 from ._markdownify import _CustomMarkdownify

 __all__ = [
@ -19,6 +20,7 @@ __all__ = [
    "_CustomMarkdownify",
    "WikipediaConverter",
    "YouTubeConverter",
+    "ImageConverter",
    "IpynbConverter",
    "BingSerpConverter",
    "PdfConverter",
--- a/packages/markitup/src/markitup/converters/_audio_converter.py
+++ b/packages/markitup/src/markitup/converters/_audio_converter.py
@ -29,8 +29,8 @@ class AudioConverter(DocumentConverter):
                file_stream, magic_type=stream_info.magic_type)
            if transcript:
                md_content += "\n\n### Audio Transcript:\n" + transcript
-            return DocumentConverterResult(markdown=md_content.strip())
+            return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
        else:
-            return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)
+            return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info, config=self.config)

        # Return the result
--- a/packages/markitup/src/markitup/converters/_csv_converter.py
+++ b/packages/markitup/src/markitup/converters/_csv_converter.py
@ -4,13 +4,15 @@ import io
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._schemas import StreamInfo
+from .._schemas import StreamInfo, Config


 class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.
    """
+    def __init__(self, config: Config):
+        """Keep the given Config so `convert` can attach it to its result."""
+        self.config = config

    def convert(
        self,
@ -48,4 +50,4 @@ class CsvConverter(DocumentConverter):

        result = "\n".join(markdown_table)

-        return DocumentConverterResult(markdown=result)
+        return DocumentConverterResult(markdown=result, config=self.config)
--- a/packages/markitup/src/markitup/converters/_html_converter.py
+++ b/packages/markitup/src/markitup/converters/_html_converter.py
@ -52,10 +52,11 @@ class HtmlConverter(DocumentConverter):

        # remove leading and trailing \n
        webpage_text = webpage_text.strip()
-        print(webpage_text)
+
        return DocumentConverterResult(
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
+            config=self.config,
        )

    def convert_string(
--- a/packages/markitup/src/markitup/converters/_image_converter.py
+++ b/packages/markitup/src/markitup/converters/_image_converter.py
@ -0,0 +1,48 @@
+from typing import BinaryIO, Any
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._schemas import StreamInfo, Config
+import base64
+
+
+class ImageConverter(DocumentConverter):
+    """
+    Turns an image file into Markdown that embeds the image as a base64 data URI.
+    """
+
+    # Data-URI subtype for each recognised magic type; anything else uses "png".
+    _SUBTYPE_BY_MAGIC_TYPE = {
+        "image/jpeg": "jpeg",
+        "image/jpg": "jpeg",
+        "image/png": "png",
+        "image/webp": "webp",
+    }
+
+    def __init__(self, config: Config):
+        """Keep the Config whose `modalities` list decides whether images are embedded."""
+        self.config = config
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Build the Markdown result for a single image stream.
+
+        Parameters:
+        - file_stream: Binary stream holding the image; it is read in full.
+        - stream_info: Supplies `magic_type`, used to pick the data-URI subtype.
+        - kwargs: Accepted but not used by this converter.
+
+        Returns:
+        - A DocumentConverterResult whose markdown is the embedded image, or a
+          short notice when 'image' is not among the configured modalities.
+        """
+        # The stream is consumed up front, whichever branch is taken below.
+        image_bytes = file_stream.read()
+        image_ext = self._SUBTYPE_BY_MAGIC_TYPE.get(stream_info.magic_type, "png")
+
+        if 'image' not in self.config.modalities:
+            return DocumentConverterResult(
+                markdown="No Image read as the supported modalities do not include 'image'",
+                config=self.config,
+            )
+
+        img_base64 = base64.b64encode(image_bytes).decode('utf-8')
+        return DocumentConverterResult(
+            markdown=f"![Image](data:image/{image_ext};base64,{img_base64})\n\n",
+            config=self.config,
+        )
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@ -63,4 +63,5 @@ class PdfConverter(DocumentConverter):
        doc.close()
        return DocumentConverterResult(
            markdown=markdown_content,
+            config=self.config,
        )
--- a/packages/markitup/src/markitup/converters/_pptx_converter.py
+++ b/packages/markitup/src/markitup/converters/_pptx_converter.py
@ -116,7 +116,7 @@ class PptxConverter(DocumentConverter):
                    md_content += notes_frame.text
                md_content = md_content.strip()

-        return DocumentConverterResult(markdown=md_content.strip())
+        return DocumentConverterResult(markdown=md_content.strip(), config=self.config)

    def _is_picture(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
--- a/packages/markitup/tests/test_files/test.png
+++ b/packages/markitup/tests/test_files/test.png
--- a/packages/markitup/tests/test_files/test.webp
+++ b/packages/markitup/tests/test_files/test.webp
--- a/packages/markitup/tests/test_files/test_llm.jpg
+++ b/packages/markitup/tests/test_files/test_llm.jpg