Finished WebP support
This commit is contained in:
parent
46b44d3ebd
commit
bc67a318a1
8 changed files with 57 additions and 10 deletions
|
|
@ -2,9 +2,11 @@ import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
||||||
from ._schemas import StreamInfo
|
from ._schemas import StreamInfo, Config
|
||||||
import re
|
import re
|
||||||
import base64
|
import base64
|
||||||
|
from PIL import Image
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverterResult:
|
class DocumentConverterResult:
|
||||||
|
|
@ -13,6 +15,7 @@ class DocumentConverterResult:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
markdown: str = "",
|
markdown: str = "",
|
||||||
|
config: Optional[Config] = None,
|
||||||
*,
|
*,
|
||||||
title: Optional[str] = None,
|
title: Optional[str] = None,
|
||||||
audio_stream: Optional[BinaryIO] = None,
|
audio_stream: Optional[BinaryIO] = None,
|
||||||
|
|
@ -20,18 +23,22 @@ class DocumentConverterResult:
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize the DocumentConverterResult.
|
Initialize the DocumentConverterResult.
|
||||||
|
|
||||||
The only required parameter is the converted Markdown text.
|
The only required parameter is the converted Markdown text.
|
||||||
The title, and any other metadata that may be added in the future, are optional.
|
The title, and any other metadata that may be added in the future, are optional.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- markdown: The converted Markdown text.
|
- markdown: The converted Markdown text.
|
||||||
|
- config: Optional configuration settings.
|
||||||
- title: Optional title of the document.
|
- title: Optional title of the document.
|
||||||
|
- audio_stream: Optional audio data.
|
||||||
|
- stream_info: Optional stream information.
|
||||||
"""
|
"""
|
||||||
self.markdown = markdown
|
self.markdown = markdown
|
||||||
self.audio_stream = audio_stream
|
self.audio_stream = audio_stream
|
||||||
self.title = title
|
self.title = title
|
||||||
self.stream_info = stream_info
|
self.stream_info = stream_info
|
||||||
|
self.config = config
|
||||||
|
|
||||||
def to_llm(self) -> List[Dict[str, Any]]:
|
def to_llm(self) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -65,6 +72,18 @@ class DocumentConverterResult:
|
||||||
# Extract image data
|
# Extract image data
|
||||||
alt_text, content_type, b64_data = match.groups()
|
alt_text, content_type, b64_data = match.groups()
|
||||||
|
|
||||||
|
if self.config.image_use_webp:
|
||||||
|
# Decode base64 data
|
||||||
|
img_data = base64.b64decode(b64_data)
|
||||||
|
|
||||||
|
# Check if it's already a WebP image
|
||||||
|
if "webp" not in content_type.lower():
|
||||||
|
# Convert to WebP
|
||||||
|
webp_data = self._convert_image_to_webp(img_data)
|
||||||
|
# Replace with WebP data
|
||||||
|
b64_data = base64.b64encode(webp_data).decode('utf-8')
|
||||||
|
content_type = "image/webp"
|
||||||
|
|
||||||
# Add the image
|
# Add the image
|
||||||
content.append({
|
content.append({
|
||||||
"type": "image",
|
"type": "image",
|
||||||
|
|
@ -94,6 +113,28 @@ class DocumentConverterResult:
|
||||||
})
|
})
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
def _convert_image_to_webp(self, image_data: bytes, quality: int = 80) -> bytes:
    """
    Convert image data to WebP format.

    Images that carry transparency (an alpha band, or a palette image with
    a transparency entry) are encoded as RGBA so the alpha channel is kept;
    WebP can store it, so flattening to RGB would silently discard it.
    RGB and greyscale ("L") images are handed to the encoder unchanged, and
    every other mode is converted to RGB first.

    Parameters:
    - image_data: The original image data as bytes.
    - quality: The quality setting (0-100) for WebP conversion.

    Returns:
    - WebP converted image data as bytes.

    Errors raised by Pillow while opening, converting or saving the image
    are not caught here and propagate to the caller.
    """
    # The context manager releases the decoder's resources even if the
    # conversion or the save below raises.
    with Image.open(BytesIO(image_data)) as source:
        has_alpha = "A" in source.getbands() or (
            source.mode == "P" and "transparency" in source.info
        )
        if has_alpha:
            target_mode = "RGBA"
        elif source.mode in ("RGB", "L"):
            target_mode = source.mode
        else:
            target_mode = "RGB"

        # Only convert when needed; convert() returns a new image object.
        frame = source if source.mode == target_mode else source.convert(target_mode)

        # Encode while the source is still open: Pillow loads pixel data
        # lazily, so an unconverted image still needs its underlying stream.
        webp_buffer = BytesIO()
        frame.save(webp_buffer, format="WEBP", quality=quality)

    return webp_buffer.getvalue()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text_content(self) -> str:
|
def text_content(self) -> str:
|
||||||
"""Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
|
"""Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,7 @@ class MarkItUp:
|
||||||
case "xls":
|
case "xls":
|
||||||
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
|
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
case "csv":
|
case "csv":
|
||||||
return CsvConverter().convert(stream, stream_info), stream_info
|
return CsvConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
case "docx":
|
case "docx":
|
||||||
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
case "image":
|
case "image":
|
||||||
|
|
|
||||||
|
|
@ -29,8 +29,8 @@ class AudioConverter(DocumentConverter):
|
||||||
file_stream, magic_type=stream_info.magic_type)
|
file_stream, magic_type=stream_info.magic_type)
|
||||||
if transcript:
|
if transcript:
|
||||||
md_content += "\n\n### Audio Transcript:\n" + transcript
|
md_content += "\n\n### Audio Transcript:\n" + transcript
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
|
||||||
else:
|
else:
|
||||||
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)
|
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info, config=self.config)
|
||||||
|
|
||||||
# Return the result
|
# Return the result
|
||||||
|
|
|
||||||
|
|
@ -4,13 +4,15 @@ import io
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
from charset_normalizer import from_bytes
|
from charset_normalizer import from_bytes
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._schemas import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
|
|
||||||
|
|
||||||
class CsvConverter(DocumentConverter):
|
class CsvConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts CSV files to Markdown tables.
|
Converts CSV files to Markdown tables.
|
||||||
"""
|
"""
|
||||||
|
def __init__(self, config: Config):
    """
    Initialize the converter.

    Parameters:
    - config: Configuration settings, stored on the instance as `config`.
    """
    self.config = config
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
|
|
@ -48,4 +50,4 @@ class CsvConverter(DocumentConverter):
|
||||||
|
|
||||||
result = "\n".join(markdown_table)
|
result = "\n".join(markdown_table)
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=result)
|
return DocumentConverterResult(markdown=result, config=self.config)
|
||||||
|
|
|
||||||
|
|
@ -52,10 +52,11 @@ class HtmlConverter(DocumentConverter):
|
||||||
|
|
||||||
# remove leading and trailing \n
|
# remove leading and trailing \n
|
||||||
webpage_text = webpage_text.strip()
|
webpage_text = webpage_text.strip()
|
||||||
print(webpage_text)
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=webpage_text,
|
markdown=webpage_text,
|
||||||
title=None if soup.title is None else soup.title.string,
|
title=None if soup.title is None else soup.title.string,
|
||||||
|
config=self.config,
|
||||||
)
|
)
|
||||||
|
|
||||||
def convert_string(
|
def convert_string(
|
||||||
|
|
|
||||||
|
|
@ -39,8 +39,10 @@ class ImageConverter(DocumentConverter):
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=markdown_content,
|
markdown=markdown_content,
|
||||||
|
config=self.config,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown="No Image read as the supported modalities do not include 'image'",
|
markdown="No Image read as the supported modalities do not include 'image'",
|
||||||
|
config=self.config,
|
||||||
)
|
)
|
||||||
|
|
@ -63,4 +63,5 @@ class PdfConverter(DocumentConverter):
|
||||||
doc.close()
|
doc.close()
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=markdown_content,
|
markdown=markdown_content,
|
||||||
|
config=self.config,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -116,7 +116,7 @@ class PptxConverter(DocumentConverter):
|
||||||
md_content += notes_frame.text
|
md_content += notes_frame.text
|
||||||
md_content = md_content.strip()
|
md_content = md_content.strip()
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
|
||||||
|
|
||||||
def _is_picture(self, shape):
|
def _is_picture(self, shape):
|
||||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue