support images
This commit is contained in:
parent
cd85971867
commit
46b44d3ebd
7 changed files with 59 additions and 1 deletions
|
|
@ -9,6 +9,7 @@ from ._schemas import StreamInfo, Config
|
||||||
from .converters import (
|
from .converters import (
|
||||||
PlainTextConverter,
|
PlainTextConverter,
|
||||||
HtmlConverter,
|
HtmlConverter,
|
||||||
|
ImageConverter,
|
||||||
PdfConverter,
|
PdfConverter,
|
||||||
DocxConverter,
|
DocxConverter,
|
||||||
XlsxConverter,
|
XlsxConverter,
|
||||||
|
|
@ -58,11 +59,16 @@ class MarkItUp:
|
||||||
return CsvConverter().convert(stream, stream_info), stream_info
|
return CsvConverter().convert(stream, stream_info), stream_info
|
||||||
case "docx":
|
case "docx":
|
||||||
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
|
case "image":
|
||||||
|
return ImageConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
case _:
|
case _:
|
||||||
match stream_info.category:
|
match stream_info.category:
|
||||||
case "ppt":
|
case "ppt":
|
||||||
raise UnsupportedFormatException(
|
raise UnsupportedFormatException(
|
||||||
".ppt files are not supported, try .pptx instead")
|
".ppt files are not supported, try .pptx instead")
|
||||||
|
case "doc":
|
||||||
|
raise UnsupportedFormatException(
|
||||||
|
".doc files are not supported, try .docx instead")
|
||||||
case "other":
|
case "other":
|
||||||
raise UnsupportedFormatException(
|
raise UnsupportedFormatException(
|
||||||
f"{stream_info.magic_type} files are not supported")
|
f"{stream_info.magic_type} files are not supported")
|
||||||
|
|
@ -84,7 +90,10 @@ class MarkItUp:
|
||||||
|
|
||||||
# Determine file category based on magic_type
|
# Determine file category based on magic_type
|
||||||
if magic_type.startswith("image/"):
|
if magic_type.startswith("image/"):
|
||||||
category = "image"
|
if magic_type in ["image/webp", "image/jpeg", "image/png", "image/jpg"]:
|
||||||
|
category = "image"
|
||||||
|
else:
|
||||||
|
category = "other"
|
||||||
elif magic_type.startswith("audio/"):
|
elif magic_type.startswith("audio/"):
|
||||||
category = "audio"
|
category = "audio"
|
||||||
elif magic_type.startswith("video/"):
|
elif magic_type.startswith("video/"):
|
||||||
|
|
|
||||||
|
|
@ -13,3 +13,4 @@ class Config:
|
||||||
modalities: List[Literal["image", "audio"]] = field(
|
modalities: List[Literal["image", "audio"]] = field(
|
||||||
default_factory=lambda: ["image", "audio"]
|
default_factory=lambda: ["image", "audio"]
|
||||||
)
|
)
|
||||||
|
image_use_webp: bool = True # TODO: support files that contain images
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||||
from ._pptx_converter import PptxConverter
|
from ._pptx_converter import PptxConverter
|
||||||
from ._audio_converter import AudioConverter
|
from ._audio_converter import AudioConverter
|
||||||
from ._csv_converter import CsvConverter
|
from ._csv_converter import CsvConverter
|
||||||
|
from ._image_converter import ImageConverter
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
|
@ -19,6 +20,7 @@ __all__ = [
|
||||||
"_CustomMarkdownify",
|
"_CustomMarkdownify",
|
||||||
"WikipediaConverter",
|
"WikipediaConverter",
|
||||||
"YouTubeConverter",
|
"YouTubeConverter",
|
||||||
|
"ImageConverter",
|
||||||
"IpynbConverter",
|
"IpynbConverter",
|
||||||
"BingSerpConverter",
|
"BingSerpConverter",
|
||||||
"PdfConverter",
|
"PdfConverter",
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,46 @@
|
||||||
|
from typing import BinaryIO, Any
|
||||||
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._schemas import StreamInfo, Config
|
||||||
|
import base64
|
||||||
|
|
||||||
|
|
||||||
|
class ImageConverter(DocumentConverter):
    """
    Converts image files to markdown with an embedded base64 image.

    The raw image bytes are inlined into the markdown as a ``data:`` URI, so
    the output does not reference any external file.
    """

    # Maps the detected MIME type to the subtype written into the data URI.
    # "image/jpg" is not a registered MIME type but is accepted as an alias.
    _DATA_URI_SUBTYPES = {
        "image/jpeg": "jpeg",
        "image/jpg": "jpeg",
        "image/png": "png",
        "image/webp": "webp",
    }

    def __init__(self, config: Config):
        """Store the configuration; only ``config.modalities`` is read here."""
        self.config = config

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Render the image in ``file_stream`` as a markdown image tag.

        Args:
            file_stream: Binary stream positioned at the start of the image;
                it is read to the end.
            stream_info: Stream metadata; ``magic_type`` selects the image
                subtype used in the data URI (unknown types fall back to png).
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A result whose markdown is the embedded image when ``"image"`` is
            among the configured modalities, otherwise an explanatory notice.
        """
        # Read the image data (done unconditionally, as before, so the stream
        # is consumed regardless of the configured modalities).
        image_bytes = file_stream.read()

        if "image" not in self.config.modalities:
            return DocumentConverterResult(
                markdown="No Image read as the supported modalities do not include 'image'",
            )

        # Determine image extension from magic_type, defaulting to png.
        image_ext = self._DATA_URI_SUBTYPES.get(stream_info.magic_type, "png")
        img_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Create markdown with the embedded image. Previously the f-string held
        # only newlines, so the encoded payload never reached the output.
        markdown_content = (
            f"![image](data:image/{image_ext};base64,{img_base64})\n\n"
        )

        return DocumentConverterResult(
            markdown=markdown_content,
        )
|
||||||
BIN
packages/markitup/tests/test_files/test.png
Normal file
BIN
packages/markitup/tests/test_files/test.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 2.8 MiB |
BIN
packages/markitup/tests/test_files/test.webp
Normal file
BIN
packages/markitup/tests/test_files/test.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 145 KiB |
Loading…
Reference in a new issue