Merge pull request #3 from pathintegral-institute/rong/tech-141-image-and-html-support

Rong/tech 141 image and html support
This commit is contained in:
rong-xyz 2025-04-23 14:44:36 +08:00 committed by GitHub
commit ff31c019df
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 120 additions and 11 deletions

View file

@ -2,9 +2,11 @@ import os
import tempfile import tempfile
from warnings import warn from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List, Dict from typing import Any, Union, BinaryIO, Optional, List, Dict
from ._schemas import StreamInfo from ._schemas import StreamInfo, Config
import re import re
import base64 import base64
from PIL import Image
from io import BytesIO
class DocumentConverterResult: class DocumentConverterResult:
@ -13,6 +15,7 @@ class DocumentConverterResult:
def __init__( def __init__(
self, self,
markdown: str = "", markdown: str = "",
config: Optional[Config] = None,
*, *,
title: Optional[str] = None, title: Optional[str] = None,
audio_stream: Optional[BinaryIO] = None, audio_stream: Optional[BinaryIO] = None,
@ -26,12 +29,16 @@ class DocumentConverterResult:
Parameters: Parameters:
- markdown: The converted Markdown text. - markdown: The converted Markdown text.
- config: Optional configuration settings.
- title: Optional title of the document. - title: Optional title of the document.
- audio_stream: Optional audio data.
- stream_info: Optional stream information.
""" """
self.markdown = markdown self.markdown = markdown
self.audio_stream = audio_stream self.audio_stream = audio_stream
self.title = title self.title = title
self.stream_info = stream_info self.stream_info = stream_info
self.config = config
def to_llm(self) -> List[Dict[str, Any]]: def to_llm(self) -> List[Dict[str, Any]]:
""" """
@ -65,6 +72,18 @@ class DocumentConverterResult:
# Extract image data # Extract image data
alt_text, content_type, b64_data = match.groups() alt_text, content_type, b64_data = match.groups()
# Re-encode the embedded image as WebP when the configuration asks for it.
# `config` is optional and defaults to None, so check it before reading the
# flag; a result built without a config keeps the image in its original format.
use_webp = self.config is not None and self.config.image_use_webp
if use_webp and "webp" not in content_type.lower():
    # Decode only when a conversion is actually required.
    img_data = base64.b64decode(b64_data)
    webp_data = self._convert_image_to_webp(img_data)
    # Replace both the payload and its declared content type.
    b64_data = base64.b64encode(webp_data).decode('utf-8')
    content_type = "image/webp"
# Add the image # Add the image
content.append({ content.append({
"type": "image", "type": "image",
@ -94,6 +113,28 @@ class DocumentConverterResult:
}) })
return content return content
def _convert_image_to_webp(self, image_data: bytes, quality: int = 80) -> bytes:
    """
    Convert encoded image data to WebP format.

    Transparency is preserved: images that carry an alpha band (or palette/PNG
    transparency information) are encoded from RGBA, everything else from RGB.

    Parameters:
    - image_data: The original encoded image as bytes (any format Pillow can open).
    - quality: The quality setting (0-100) for WebP conversion.

    Returns:
    - WebP converted image data as bytes.
    """
    # Use the image as a context manager so the decoder's resources are
    # released as soon as the WebP bytes have been produced.
    with Image.open(BytesIO(image_data)) as img:
        # WebP supports an alpha channel, so keep it instead of flattening to RGB.
        has_alpha = img.mode.endswith(("A", "a")) or "transparency" in img.info
        target_mode = "RGBA" if has_alpha else "RGB"
        converted = img if img.mode == target_mode else img.convert(target_mode)

        webp_buffer = BytesIO()
        converted.save(webp_buffer, format="WEBP", quality=quality)

    return webp_buffer.getvalue()
@property @property
def text_content(self) -> str: def text_content(self) -> str:
"""Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""

View file

@ -9,6 +9,7 @@ from ._schemas import StreamInfo, Config
from .converters import ( from .converters import (
PlainTextConverter, PlainTextConverter,
HtmlConverter, HtmlConverter,
ImageConverter,
PdfConverter, PdfConverter,
DocxConverter, DocxConverter,
XlsxConverter, XlsxConverter,
@ -55,14 +56,21 @@ class MarkItUp:
case "xls": case "xls":
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
case "csv": case "csv":
return CsvConverter().convert(stream, stream_info), stream_info return CsvConverter(config=self.config).convert(stream, stream_info), stream_info
case "docx": case "docx":
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
case "image":
return ImageConverter(config=self.config).convert(stream, stream_info), stream_info
case "html":
return HtmlConverter(config=self.config).convert(stream, stream_info), stream_info
case _: case _:
match stream_info.category: match stream_info.category:
case "ppt": case "ppt":
raise UnsupportedFormatException( raise UnsupportedFormatException(
".ppt files are not supported, try .pptx instead") ".ppt files are not supported, try .pptx instead")
case "doc":
raise UnsupportedFormatException(
".doc files are not supported, try .docx instead")
case "other": case "other":
raise UnsupportedFormatException( raise UnsupportedFormatException(
f"{stream_info.magic_type} files are not supported") f"{stream_info.magic_type} files are not supported")
@ -84,7 +92,10 @@ class MarkItUp:
# Determine file category based on magic_type # Determine file category based on magic_type
if magic_type.startswith("image/"): if magic_type.startswith("image/"):
category = "image" if magic_type in ["image/webp", "image/jpeg", "image/png", "image/jpg"]:
category = "image"
else:
category = "other"
elif magic_type.startswith("audio/"): elif magic_type.startswith("audio/"):
category = "audio" category = "audio"
elif magic_type.startswith("video/"): elif magic_type.startswith("video/"):
@ -108,6 +119,8 @@ class MarkItUp:
elif magic_type.startswith("text/"): elif magic_type.startswith("text/"):
if magic_type == "text/csv": if magic_type == "text/csv":
category = "csv" category = "csv"
elif magic_type == "text/html":
category = "html"
else: else:
category = "text" category = "text"
else: else:

View file

@ -13,3 +13,4 @@ class Config:
modalities: List[Literal["image", "audio"]] = field( modalities: List[Literal["image", "audio"]] = field(
default_factory=lambda: ["image", "audio"] default_factory=lambda: ["image", "audio"]
) )
image_use_webp: bool = True  # TODO: support files that contain images

View file

@ -10,6 +10,7 @@ from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter from ._pptx_converter import PptxConverter
from ._audio_converter import AudioConverter from ._audio_converter import AudioConverter
from ._csv_converter import CsvConverter from ._csv_converter import CsvConverter
from ._image_converter import ImageConverter
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
__all__ = [ __all__ = [
@ -19,6 +20,7 @@ __all__ = [
"_CustomMarkdownify", "_CustomMarkdownify",
"WikipediaConverter", "WikipediaConverter",
"YouTubeConverter", "YouTubeConverter",
"ImageConverter",
"IpynbConverter", "IpynbConverter",
"BingSerpConverter", "BingSerpConverter",
"PdfConverter", "PdfConverter",

View file

@ -29,8 +29,8 @@ class AudioConverter(DocumentConverter):
file_stream, magic_type=stream_info.magic_type) file_stream, magic_type=stream_info.magic_type)
if transcript: if transcript:
md_content += "\n\n### Audio Transcript:\n" + transcript md_content += "\n\n### Audio Transcript:\n" + transcript
return DocumentConverterResult(markdown=md_content.strip()) return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
else: else:
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info) return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info, config=self.config)
# Return the result # Return the result

View file

@ -4,13 +4,15 @@ import io
from typing import BinaryIO, Any from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._schemas import StreamInfo from .._schemas import StreamInfo, Config
class CsvConverter(DocumentConverter): class CsvConverter(DocumentConverter):
""" """
Converts CSV files to Markdown tables. Converts CSV files to Markdown tables.
""" """
def __init__(self, config: Config):
    """
    Keep the configuration so it can be attached to the conversion result.

    Parameters:
    - config: The configuration passed on to the DocumentConverterResult.
    """
    self.config = config
def convert( def convert(
self, self,
@ -48,4 +50,4 @@ class CsvConverter(DocumentConverter):
result = "\n".join(markdown_table) result = "\n".join(markdown_table)
return DocumentConverterResult(markdown=result) return DocumentConverterResult(markdown=result, config=self.config)

View file

@ -52,10 +52,11 @@ class HtmlConverter(DocumentConverter):
# remove leading and trailing \n # remove leading and trailing \n
webpage_text = webpage_text.strip() webpage_text = webpage_text.strip()
print(webpage_text)
return DocumentConverterResult( return DocumentConverterResult(
markdown=webpage_text, markdown=webpage_text,
title=None if soup.title is None else soup.title.string, title=None if soup.title is None else soup.title.string,
config=self.config,
) )
def convert_string( def convert_string(

View file

@ -0,0 +1,48 @@
from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._schemas import StreamInfo, Config
import base64
class ImageConverter(DocumentConverter):
    """
    Turns an image stream into Markdown that embeds the image as a base64 data URI.
    """

    def __init__(self, config: Config):
        """Remember the configuration used to decide whether images are emitted."""
        self.config = config

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Read the whole stream and wrap it in a Markdown image tag.

        Parameters:
        - file_stream: Binary stream holding the image; it is read to the end.
        - stream_info: Stream metadata; its magic_type selects the data-URI subtype.
        - kwargs: Accepted for interface compatibility; not used here.

        Returns:
        - A DocumentConverterResult whose markdown is either the embedded image
          or a notice that the 'image' modality is not enabled.
        """
        payload = file_stream.read()

        # Map the detected MIME type to the data-URI subtype; anything
        # unrecognised (including "image/png") is labelled as png.
        detected = stream_info.magic_type
        if detected in ("image/jpeg", "image/jpg"):
            image_ext = "jpeg"
        elif detected == "image/webp":
            image_ext = "webp"
        else:
            image_ext = "png"

        # Guard clause: without the image modality only a notice is returned.
        if 'image' not in self.config.modalities:
            return DocumentConverterResult(
                markdown="No Image read as the supported modalities do not include 'image'",
                config=self.config,
            )

        img_base64 = base64.b64encode(payload).decode('utf-8')
        return DocumentConverterResult(
            markdown=f"![Image](data:image/{image_ext};base64,{img_base64})\n\n",
            config=self.config,
        )

View file

@ -63,4 +63,5 @@ class PdfConverter(DocumentConverter):
doc.close() doc.close()
return DocumentConverterResult( return DocumentConverterResult(
markdown=markdown_content, markdown=markdown_content,
config=self.config,
) )

View file

@ -116,7 +116,7 @@ class PptxConverter(DocumentConverter):
md_content += notes_frame.text md_content += notes_frame.text
md_content = md_content.strip() md_content = md_content.strip()
return DocumentConverterResult(markdown=md_content.strip()) return DocumentConverterResult(markdown=md_content.strip(), config=self.config)
def _is_picture(self, shape): def _is_picture(self, shape):
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 145 KiB