Merge pull request #3 from pathintegral-institute/rong/tech-141-image-and-html-support
Rong/tech 141 image and html support
This commit is contained in:
commit
ff31c019df
13 changed files with 120 additions and 11 deletions
|
|
@ -2,9 +2,11 @@ import os
|
|||
import tempfile
|
||||
from warnings import warn
|
||||
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
||||
from ._schemas import StreamInfo
|
||||
from ._schemas import StreamInfo, Config
|
||||
import re
|
||||
import base64
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class DocumentConverterResult:
|
||||
|
|
@ -13,6 +15,7 @@ class DocumentConverterResult:
|
|||
def __init__(
|
||||
self,
|
||||
markdown: str = "",
|
||||
config: Optional[Config] = None,
|
||||
*,
|
||||
title: Optional[str] = None,
|
||||
audio_stream: Optional[BinaryIO] = None,
|
||||
|
|
@ -20,18 +23,22 @@ class DocumentConverterResult:
|
|||
):
|
||||
"""
|
||||
Initialize the DocumentConverterResult.
|
||||
|
||||
|
||||
The only required parameter is the converted Markdown text.
|
||||
The title, and any other metadata that may be added in the future, are optional.
|
||||
|
||||
|
||||
Parameters:
|
||||
- markdown: The converted Markdown text.
|
||||
- config: Optional configuration settings.
|
||||
- title: Optional title of the document.
|
||||
- audio_stream: Optional audio data.
|
||||
- stream_info: Optional stream information.
|
||||
"""
|
||||
self.markdown = markdown
|
||||
self.audio_stream = audio_stream
|
||||
self.title = title
|
||||
self.stream_info = stream_info
|
||||
self.config = config
|
||||
|
||||
def to_llm(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
|
|
@ -65,6 +72,18 @@ class DocumentConverterResult:
|
|||
# Extract image data
|
||||
alt_text, content_type, b64_data = match.groups()
|
||||
|
||||
if self.config.image_use_webp:
|
||||
# Decode base64 data
|
||||
img_data = base64.b64decode(b64_data)
|
||||
|
||||
# Check if it's already a WebP image
|
||||
if "webp" not in content_type.lower():
|
||||
# Convert to WebP
|
||||
webp_data = self._convert_image_to_webp(img_data)
|
||||
# Replace with WebP data
|
||||
b64_data = base64.b64encode(webp_data).decode('utf-8')
|
||||
content_type = "image/webp"
|
||||
|
||||
# Add the image
|
||||
content.append({
|
||||
"type": "image",
|
||||
|
|
@ -94,6 +113,28 @@ class DocumentConverterResult:
|
|||
})
|
||||
return content
|
||||
|
||||
def _convert_image_to_webp(self, image_data: bytes, quality: int = 80) -> bytes:
    """
    Re-encode raw image bytes as WebP.

    Parameters:
    - image_data: Encoded bytes of the source image, in any format PIL can open.
    - quality: WebP quality setting (0-100) passed through to the encoder.

    Returns:
    - The WebP-encoded image as bytes.
    """
    source = Image.open(BytesIO(image_data))

    # Only 'RGB' and 'L' images are encoded as-is; every other mode
    # (including the alpha modes 'RGBA' and 'LA') is flattened to RGB first.
    if source.mode not in ('RGB', 'L'):
        source = source.convert('RGB')

    # Encode into an in-memory buffer and hand back its full contents.
    encoded = BytesIO()
    source.save(encoded, format="WEBP", quality=quality)
    return encoded.getvalue()
|
||||
|
||||
@property
|
||||
def text_content(self) -> str:
|
||||
"""Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from ._schemas import StreamInfo, Config
|
|||
from .converters import (
|
||||
PlainTextConverter,
|
||||
HtmlConverter,
|
||||
ImageConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
XlsxConverter,
|
||||
|
|
@ -55,14 +56,21 @@ class MarkItUp:
|
|||
case "xls":
|
||||
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "csv":
|
||||
return CsvConverter().convert(stream, stream_info), stream_info
|
||||
return CsvConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "docx":
|
||||
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "image":
|
||||
return ImageConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "html":
|
||||
return HtmlConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case _:
|
||||
match stream_info.category:
|
||||
case "ppt":
|
||||
raise UnsupportedFormatException(
|
||||
".ppt files are not supported, try .pptx instead")
|
||||
case "doc":
|
||||
raise UnsupportedFormatException(
|
||||
".doc files are not supported, try .docx instead")
|
||||
case "other":
|
||||
raise UnsupportedFormatException(
|
||||
f"{stream_info.magic_type} files are not supported")
|
||||
|
|
@ -84,7 +92,10 @@ class MarkItUp:
|
|||
|
||||
# Determine file category based on magic_type
|
||||
if magic_type.startswith("image/"):
|
||||
category = "image"
|
||||
if magic_type in ["image/webp", "image/jpeg", "image/png", "image/jpg"]:
|
||||
category = "image"
|
||||
else:
|
||||
category = "other"
|
||||
elif magic_type.startswith("audio/"):
|
||||
category = "audio"
|
||||
elif magic_type.startswith("video/"):
|
||||
|
|
@ -108,6 +119,8 @@ class MarkItUp:
|
|||
elif magic_type.startswith("text/"):
|
||||
if magic_type == "text/csv":
|
||||
category = "csv"
|
||||
elif magic_type == "text/html":
|
||||
category = "html"
|
||||
else:
|
||||
category = "text"
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -13,3 +13,4 @@ class Config:
|
|||
modalities: List[Literal["image", "audio"]] = field(
|
||||
default_factory=lambda: ["image", "audio"]
|
||||
)
|
||||
image_use_webp: bool = True # TODO: support files contains images
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from ._xlsx_converter import XlsxConverter, XlsConverter
|
|||
from ._pptx_converter import PptxConverter
|
||||
from ._audio_converter import AudioConverter
|
||||
from ._csv_converter import CsvConverter
|
||||
from ._image_converter import ImageConverter
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
__all__ = [
|
||||
|
|
@ -19,6 +20,7 @@ __all__ = [
|
|||
"_CustomMarkdownify",
|
||||
"WikipediaConverter",
|
||||
"YouTubeConverter",
|
"ImageConverter",
|
||||
"IpynbConverter",
|
||||
"BingSerpConverter",
|
||||
"PdfConverter",
|
||||
|
|
|
|||
|
|
@ -29,8 +29,8 @@ class AudioConverter(DocumentConverter):
|
|||
file_stream, magic_type=stream_info.magic_type)
|
||||
if transcript:
|
||||
md_content += "\n\n### Audio Transcript:\n" + transcript
|
||||
return DocumentConverterResult(markdown=md_content.strip())
|
||||
return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
|
||||
else:
|
||||
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)
|
||||
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info, config=self.config)
|
||||
|
||||
# Return the result
|
||||
|
|
|
|||
|
|
@ -4,13 +4,15 @@ import io
|
|||
from typing import BinaryIO, Any
|
||||
from charset_normalizer import from_bytes
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._schemas import StreamInfo
|
||||
from .._schemas import StreamInfo, Config
|
||||
|
||||
|
||||
class CsvConverter(DocumentConverter):
|
||||
"""
|
||||
Converts CSV files to Markdown tables.
|
||||
"""
|
||||
def __init__(self, config: Config):
|
||||
self.config = config
|
||||
|
||||
def convert(
|
||||
self,
|
||||
|
|
@ -48,4 +50,4 @@ class CsvConverter(DocumentConverter):
|
|||
|
||||
result = "\n".join(markdown_table)
|
||||
|
||||
return DocumentConverterResult(markdown=result)
|
||||
return DocumentConverterResult(markdown=result, config=self.config)
|
||||
|
|
|
|||
|
|
@ -52,10 +52,11 @@ class HtmlConverter(DocumentConverter):
|
|||
|
||||
# remove leading and trailing \n
|
||||
webpage_text = webpage_text.strip()
|
||||
print(webpage_text)
|
||||
|
||||
return DocumentConverterResult(
|
||||
markdown=webpage_text,
|
||||
title=None if soup.title is None else soup.title.string,
|
||||
config=self.config,
|
||||
)
|
||||
|
||||
def convert_string(
|
||||
|
|
|
|||
|
|
@ -0,0 +1,48 @@
|
|||
from typing import BinaryIO, Any
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._schemas import StreamInfo, Config
|
||||
import base64
|
||||
|
||||
|
||||
class ImageConverter(DocumentConverter):
|
||||
"""
|
||||
Converts image files to markdown with embedded base64 image.
|
||||
"""
|
||||
|
||||
def __init__(self, config: Config):
|
||||
self.config = config
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Read the image data
|
||||
image_bytes = file_stream.read()
|
||||
|
||||
# Determine image extension from magic_type
|
||||
image_ext = "png" # Default extension
|
||||
match stream_info.magic_type:
|
||||
case "image/jpeg" | "image/jpg":
|
||||
image_ext = "jpeg"
|
||||
case "image/png":
|
||||
image_ext = "png"
|
||||
case "image/webp":
|
||||
image_ext = "webp"
|
||||
|
||||
if 'image' in self.config.modalities:
|
||||
img_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
||||
|
||||
# Create markdown with embedded image
|
||||
markdown_content = f"\n\n"
|
||||
|
||||
return DocumentConverterResult(
|
||||
markdown=markdown_content,
|
||||
config=self.config,
|
||||
)
|
||||
else:
|
||||
return DocumentConverterResult(
|
||||
markdown="No Image read as the supported modalities do not include 'image'",
|
||||
config=self.config,
|
||||
)
|
||||
|
|
@ -63,4 +63,5 @@ class PdfConverter(DocumentConverter):
|
|||
doc.close()
|
||||
return DocumentConverterResult(
|
||||
markdown=markdown_content,
|
||||
config=self.config,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -116,7 +116,7 @@ class PptxConverter(DocumentConverter):
|
|||
md_content += notes_frame.text
|
||||
md_content = md_content.strip()
|
||||
|
||||
return DocumentConverterResult(markdown=md_content.strip())
|
||||
return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
|
||||
|
||||
def _is_picture(self, shape):
|
||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
||||
|
|
|
|||
BIN
packages/markitup/tests/test_files/test.png
Normal file
BIN
packages/markitup/tests/test_files/test.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 2.8 MiB |
BIN
packages/markitup/tests/test_files/test.webp
Normal file
BIN
packages/markitup/tests/test_files/test.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 145 KiB |
Loading…
Reference in a new issue