support images

This commit is contained in:
rong-xyz 2025-04-23 06:13:19 +00:00
parent cd85971867
commit 46b44d3ebd
7 changed files with 59 additions and 1 deletions

View file

@ -9,6 +9,7 @@ from ._schemas import StreamInfo, Config
from .converters import (
PlainTextConverter,
HtmlConverter,
ImageConverter,
PdfConverter,
DocxConverter,
XlsxConverter,
@ -58,11 +59,16 @@ class MarkItUp:
return CsvConverter().convert(stream, stream_info), stream_info
case "docx":
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
case "image":
return ImageConverter(config=self.config).convert(stream, stream_info), stream_info
case _:
match stream_info.category:
case "ppt":
raise UnsupportedFormatException(
".ppt files are not supported, try .pptx instead")
case "doc":
raise UnsupportedFormatException(
".doc files are not supported, try .docx instead")
case "other":
raise UnsupportedFormatException(
f"{stream_info.magic_type} files are not supported")
@ -84,7 +90,10 @@ class MarkItUp:
# Determine file category based on magic_type
if magic_type.startswith("image/"):
if magic_type in ["image/webp", "image/jpeg", "image/png", "image/jpg"]:
category = "image"
else:
category = "other"
elif magic_type.startswith("audio/"):
category = "audio"
elif magic_type.startswith("video/"):

View file

@ -13,3 +13,4 @@ class Config:
modalities: List[Literal["image", "audio"]] = field(
default_factory=lambda: ["image", "audio"]
)
image_use_webp: bool = True # TODO: support files contains images

View file

@ -10,6 +10,7 @@ from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._audio_converter import AudioConverter
from ._csv_converter import CsvConverter
from ._image_converter import ImageConverter
from ._markdownify import _CustomMarkdownify
__all__ = [
@ -19,6 +20,7 @@ __all__ = [
"_CustomMarkdownify",
"WikipediaConverter",
"YouTubeConverter",
"ImageConverter"
"IpynbConverter",
"BingSerpConverter",
"PdfConverter",

View file

@ -0,0 +1,46 @@
from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._schemas import StreamInfo, Config
import base64
class ImageConverter(DocumentConverter):
    """
    Converts image files to markdown with embedded base64 image.

    The image bytes are inlined as a ``data:image/<subtype>;base64,...`` URI
    inside a markdown image tag. When ``"image"`` is not listed in
    ``config.modalities`` a fixed placeholder message is returned instead.
    """

    # Data-URI subtype for each magic_type handled explicitly; both JPEG
    # spellings map to "jpeg". Any other magic_type uses _DEFAULT_SUBTYPE.
    _SUBTYPE_BY_MAGIC_TYPE = {
        "image/jpeg": "jpeg",
        "image/jpg": "jpeg",
        "image/png": "png",
        "image/webp": "webp",
    }
    _DEFAULT_SUBTYPE = "png"

    def __init__(self, config: Config):
        """Store the configuration; only ``config.modalities`` is read here."""
        self.config = config

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert an image stream into a markdown image with a base64 data URI.

        Args:
            file_stream: Binary stream positioned at the start of the image
                data; it is read to the end when the image modality is on.
            stream_info: Stream metadata; ``magic_type`` selects the subtype
                used in the data URI.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A ``DocumentConverterResult`` whose markdown is either the
            embedded image or the "modality disabled" placeholder message.
        """
        # Check the modality first so a disabled image modality does not
        # load the whole image into memory only to discard it.
        if "image" not in self.config.modalities:
            return DocumentConverterResult(
                markdown="No Image read as the supported modalities do not include 'image'",
            )

        # Determine the data-URI subtype from magic_type, defaulting to png.
        image_ext = self._SUBTYPE_BY_MAGIC_TYPE.get(
            stream_info.magic_type, self._DEFAULT_SUBTYPE
        )

        # Read the image data and embed it in the markdown.
        img_base64 = base64.b64encode(file_stream.read()).decode("utf-8")
        markdown_content = f"![Image](data:image/{image_ext};base64,{img_base64})\n\n"
        return DocumentConverterResult(
            markdown=markdown_content,
        )

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 145 KiB