finished webp support

This commit is contained in:
rong-xyz 2025-04-23 06:39:24 +00:00
parent 46b44d3ebd
commit bc67a318a1
8 changed files with 57 additions and 10 deletions

View file

@ -2,9 +2,11 @@ import os
import tempfile import tempfile
from warnings import warn from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List, Dict from typing import Any, Union, BinaryIO, Optional, List, Dict
from ._schemas import StreamInfo from ._schemas import StreamInfo, Config
import re import re
import base64 import base64
from PIL import Image
from io import BytesIO
class DocumentConverterResult: class DocumentConverterResult:
@ -13,6 +15,7 @@ class DocumentConverterResult:
def __init__( def __init__(
self, self,
markdown: str = "", markdown: str = "",
config: Optional[Config] = None,
*, *,
title: Optional[str] = None, title: Optional[str] = None,
audio_stream: Optional[BinaryIO] = None, audio_stream: Optional[BinaryIO] = None,
@ -20,18 +23,22 @@ class DocumentConverterResult:
): ):
""" """
Initialize the DocumentConverterResult. Initialize the DocumentConverterResult.
The only required parameter is the converted Markdown text. The only required parameter is the converted Markdown text.
The title, and any other metadata that may be added in the future, are optional. The title, and any other metadata that may be added in the future, are optional.
Parameters: Parameters:
- markdown: The converted Markdown text. - markdown: The converted Markdown text.
- config: Optional configuration settings.
- title: Optional title of the document. - title: Optional title of the document.
- audio_stream: Optional audio data.
- stream_info: Optional stream information.
""" """
self.markdown = markdown self.markdown = markdown
self.audio_stream = audio_stream self.audio_stream = audio_stream
self.title = title self.title = title
self.stream_info = stream_info self.stream_info = stream_info
self.config = config
def to_llm(self) -> List[Dict[str, Any]]: def to_llm(self) -> List[Dict[str, Any]]:
""" """
@ -65,6 +72,18 @@ class DocumentConverterResult:
# Extract image data # Extract image data
alt_text, content_type, b64_data = match.groups() alt_text, content_type, b64_data = match.groups()
if self.config.image_use_webp:
# Decode base64 data
img_data = base64.b64decode(b64_data)
# Check if it's already a WebP image
if "webp" not in content_type.lower():
# Convert to WebP
webp_data = self._convert_image_to_webp(img_data)
# Replace with WebP data
b64_data = base64.b64encode(webp_data).decode('utf-8')
content_type = "image/webp"
# Add the image # Add the image
content.append({ content.append({
"type": "image", "type": "image",
@ -94,6 +113,28 @@ class DocumentConverterResult:
}) })
return content return content
def _convert_image_to_webp(self, image_data: bytes, quality: int = 80) -> bytes:
    """
    Convert encoded image data to WebP format.

    Parameters:
    - image_data: The original encoded image data as bytes.
    - quality: The quality setting (0-100) for WebP conversion.

    Returns:
    - WebP converted image data as bytes.

    Any exception raised by Pillow while opening, converting or saving the
    image is propagated to the caller unchanged.
    """
    # Open through a context manager so the source image is closed
    # deterministically, even if conversion or encoding raises.
    with Image.open(BytesIO(image_data)) as source_img:
        # Only 'RGB' and 'L' images are encoded as-is. Every other mode,
        # including the alpha modes 'RGBA' and 'LA', is converted to RGB
        # first, which drops any alpha channel.
        if source_img.mode in ('RGB', 'L'):
            img = source_img
        else:
            img = source_img.convert('RGB')

        # Encode into an in-memory buffer while the source is still open.
        webp_buffer = BytesIO()
        img.save(webp_buffer, format="WEBP", quality=quality)

    # getvalue() returns the whole buffer regardless of the stream position.
    return webp_buffer.getvalue()
@property @property
def text_content(self) -> str: def text_content(self) -> str:
"""Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""

View file

@ -56,7 +56,7 @@ class MarkItUp:
case "xls": case "xls":
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
case "csv": case "csv":
return CsvConverter().convert(stream, stream_info), stream_info return CsvConverter(config=self.config).convert(stream, stream_info), stream_info
case "docx": case "docx":
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
case "image": case "image":

View file

@ -29,8 +29,8 @@ class AudioConverter(DocumentConverter):
file_stream, magic_type=stream_info.magic_type) file_stream, magic_type=stream_info.magic_type)
if transcript: if transcript:
md_content += "\n\n### Audio Transcript:\n" + transcript md_content += "\n\n### Audio Transcript:\n" + transcript
return DocumentConverterResult(markdown=md_content.strip()) return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
else: else:
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info) return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info, config=self.config)
# Return the result # Return the result

View file

@ -4,13 +4,15 @@ import io
from typing import BinaryIO, Any from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._schemas import StreamInfo from .._schemas import StreamInfo, Config
class CsvConverter(DocumentConverter): class CsvConverter(DocumentConverter):
""" """
Converts CSV files to Markdown tables. Converts CSV files to Markdown tables.
""" """
def __init__(self, config: Config):
    """
    Initialize the CSV converter.

    Parameters:
    - config: Configuration settings, stored on the instance and handed to
      the DocumentConverterResult produced by convert().
    """
    # NOTE(review): the base DocumentConverter initializer is not called here;
    # confirm the base class needs no initialization of its own.
    self.config = config
def convert( def convert(
self, self,
@ -48,4 +50,4 @@ class CsvConverter(DocumentConverter):
result = "\n".join(markdown_table) result = "\n".join(markdown_table)
return DocumentConverterResult(markdown=result) return DocumentConverterResult(markdown=result, config=self.config)

View file

@ -52,10 +52,11 @@ class HtmlConverter(DocumentConverter):
# remove leading and trailing \n # remove leading and trailing \n
webpage_text = webpage_text.strip() webpage_text = webpage_text.strip()
print(webpage_text)
return DocumentConverterResult( return DocumentConverterResult(
markdown=webpage_text, markdown=webpage_text,
title=None if soup.title is None else soup.title.string, title=None if soup.title is None else soup.title.string,
config=self.config,
) )
def convert_string( def convert_string(

View file

@ -39,8 +39,10 @@ class ImageConverter(DocumentConverter):
return DocumentConverterResult( return DocumentConverterResult(
markdown=markdown_content, markdown=markdown_content,
config=self.config,
) )
else: else:
return DocumentConverterResult( return DocumentConverterResult(
markdown="No Image read as the supported modalities do not include 'image'", markdown="No Image read as the supported modalities do not include 'image'",
config=self.config,
) )

View file

@ -63,4 +63,5 @@ class PdfConverter(DocumentConverter):
doc.close() doc.close()
return DocumentConverterResult( return DocumentConverterResult(
markdown=markdown_content, markdown=markdown_content,
config=self.config,
) )

View file

@ -116,7 +116,7 @@ class PptxConverter(DocumentConverter):
md_content += notes_frame.text md_content += notes_frame.text
md_content = md_content.strip() md_content = md_content.strip()
return DocumentConverterResult(markdown=md_content.strip()) return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
def _is_picture(self, shape): def _is_picture(self, shape):
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: