support images
This commit is contained in:
parent
cd85971867
commit
46b44d3ebd
7 changed files with 59 additions and 1 deletions
|
|
@ -9,6 +9,7 @@ from ._schemas import StreamInfo, Config
|
|||
from .converters import (
|
||||
PlainTextConverter,
|
||||
HtmlConverter,
|
||||
ImageConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
XlsxConverter,
|
||||
|
|
@ -58,11 +59,16 @@ class MarkItUp:
|
|||
return CsvConverter().convert(stream, stream_info), stream_info
|
||||
case "docx":
|
||||
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "image":
|
||||
return ImageConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case _:
|
||||
match stream_info.category:
|
||||
case "ppt":
|
||||
raise UnsupportedFormatException(
|
||||
".ppt files are not supported, try .pptx instead")
|
||||
case "doc":
|
||||
raise UnsupportedFormatException(
|
||||
".doc files are not supported, try .docx instead")
|
||||
case "other":
|
||||
raise UnsupportedFormatException(
|
||||
f"{stream_info.magic_type} files are not supported")
|
||||
|
|
@ -84,7 +90,10 @@ class MarkItUp:
|
|||
|
||||
# Determine file category based on magic_type
|
||||
if magic_type.startswith("image/"):
|
||||
category = "image"
|
||||
if magic_type in ["image/webp", "image/jpeg", "image/png", "image/jpg"]:
|
||||
category = "image"
|
||||
else:
|
||||
category = "other"
|
||||
elif magic_type.startswith("audio/"):
|
||||
category = "audio"
|
||||
elif magic_type.startswith("video/"):
|
||||
|
|
|
|||
|
|
@ -13,3 +13,4 @@ class Config:
|
|||
modalities: List[Literal["image", "audio"]] = field(
|
||||
default_factory=lambda: ["image", "audio"]
|
||||
)
|
||||
image_use_webp: bool = True # TODO: support files that contain images
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from ._xlsx_converter import XlsxConverter, XlsConverter
|
|||
from ._pptx_converter import PptxConverter
|
||||
from ._audio_converter import AudioConverter
|
||||
from ._csv_converter import CsvConverter
|
||||
from ._image_converter import ImageConverter
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
__all__ = [
|
||||
|
|
@ -19,6 +20,7 @@ __all__ = [
|
|||
"_CustomMarkdownify",
|
||||
"WikipediaConverter",
|
||||
"YouTubeConverter",
|
||||
"ImageConverter",
|
||||
"IpynbConverter",
|
||||
"BingSerpConverter",
|
||||
"PdfConverter",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,46 @@
|
|||
from typing import BinaryIO, Any
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._schemas import StreamInfo, Config
|
||||
import base64
|
||||
|
||||
|
||||
class ImageConverter(DocumentConverter):
    """
    Converts image files to markdown with an embedded base64 image.

    The raw image bytes are inlined as a ``data:`` URI so the resulting
    markdown is self-contained. Embedding only happens when ``"image"`` is
    listed in ``config.modalities``; otherwise a short notice is returned.
    """

    # Maps the detected MIME type to the image subtype used in the data URI.
    _IMAGE_EXT_BY_MAGIC_TYPE = {
        "image/jpeg": "jpeg",
        "image/jpg": "jpeg",
        "image/png": "png",
        "image/webp": "webp",
    }
    # Subtype used when the MIME type is not in the table above.
    _DEFAULT_IMAGE_EXT = "png"

    def __init__(self, config: Config):
        """Store the configuration whose ``modalities`` gate image output."""
        self.config = config

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert an image stream into markdown.

        Args:
            file_stream: Binary stream positioned at the start of the image
                data; it is read to the end.
            stream_info: Stream metadata; only ``magic_type`` is consulted,
                to pick the image subtype for the data URI.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A ``DocumentConverterResult`` whose markdown is either an image
            tag carrying the base64-encoded data, or a notice that the image
            modality is disabled.
        """
        # The stream is read up front regardless of the modality setting.
        image_bytes = file_stream.read()

        if "image" not in self.config.modalities:
            return DocumentConverterResult(
                markdown="No Image read as the supported modalities do not include 'image'",
            )

        image_ext = self._IMAGE_EXT_BY_MAGIC_TYPE.get(
            stream_info.magic_type, self._DEFAULT_IMAGE_EXT
        )
        img_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Embed the payload as a data URI; without the placeholders the
        # output would be blank lines and the image would be lost.
        markdown_content = (
            f"\n![Image](data:image/{image_ext};base64,{img_base64})\n\n"
        )

        return DocumentConverterResult(
            markdown=markdown_content,
        )
|
||||
BIN
packages/markitup/tests/test_files/test.png
Normal file
BIN
packages/markitup/tests/test_files/test.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 2.8 MiB |
BIN
packages/markitup/tests/test_files/test.webp
Normal file
BIN
packages/markitup/tests/test_files/test.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 145 KiB |
Loading…
Reference in a new issue