modality
This commit is contained in:
parent
e729da2b38
commit
03f3fa9829
14 changed files with 66 additions and 47 deletions
|
|
@ -7,7 +7,7 @@ from ._markitup import (
|
|||
MarkItUp,
|
||||
)
|
||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||
from ._stream_info import StreamInfo
|
||||
from ._schemas import StreamInfo, Config
|
||||
from ._exceptions import (
|
||||
MarkItUpException,
|
||||
MissingDependencyException,
|
||||
|
|
@ -27,4 +27,5 @@ __all__ = [
|
|||
"FileConversionException",
|
||||
"UnsupportedFormatException",
|
||||
"StreamInfo",
|
||||
"Config"
|
||||
]
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import os
|
|||
import tempfile
|
||||
from warnings import warn
|
||||
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
||||
from ._stream_info import StreamInfo
|
||||
from ._schemas import StreamInfo
|
||||
import re
|
||||
|
||||
|
||||
|
|
@ -27,19 +27,18 @@ class DocumentConverterResult:
|
|||
"""
|
||||
self.markdown = markdown
|
||||
self.title = title
|
||||
|
||||
|
||||
def to_llm(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Convert markdown with base64 images to a format compatible with OpenAI's API.
|
||||
|
||||
|
||||
This function parses the markdown content, extracting text and images in their
|
||||
original order, and returns a list of content elements in OpenAI's format.
|
||||
|
||||
|
||||
Returns:
|
||||
List[Dict[str, Any]]: A list of dictionaries representing the content elements
|
||||
(text and images) in their original order.
|
||||
"""
|
||||
|
||||
|
||||
# Pattern to match markdown image syntax with base64 data
|
||||
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ from urllib.parse import urlparse
|
|||
from warnings import warn
|
||||
import magic
|
||||
|
||||
from ._stream_info import StreamInfo
|
||||
from ._schemas import StreamInfo, Config
|
||||
|
||||
from .converters import (
|
||||
PlainTextConverter,
|
||||
|
|
@ -33,7 +33,7 @@ class MarkItUp:
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
config: Optional[Dict[str, Any]] = None,
|
||||
config: Config = Config(),
|
||||
):
|
||||
self.config = config
|
||||
|
||||
|
|
@ -42,10 +42,12 @@ class MarkItUp:
|
|||
# Deal with unsupported file types
|
||||
match stream_info.category:
|
||||
case "ppt":
|
||||
raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
|
||||
raise UnsupportedFormatException(
|
||||
".ppt files are not supported, try .pptx instead")
|
||||
case "other":
|
||||
raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
|
||||
|
||||
raise UnsupportedFormatException(
|
||||
f"{stream_info.magic_type} files are not supported")
|
||||
|
||||
try:
|
||||
match stream_info.category:
|
||||
case "text":
|
||||
|
|
@ -55,7 +57,8 @@ class MarkItUp:
|
|||
case "pdf":
|
||||
return PdfConverter().convert(stream, stream_info), stream_info
|
||||
except FailedConversionAttempt:
|
||||
raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
|
||||
raise FileConversionException(
|
||||
f"Failed to convert file of type {stream_info.magic_type}")
|
||||
return stream_info
|
||||
|
||||
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
|
||||
|
|
@ -97,4 +100,4 @@ class MarkItUp:
|
|||
category = "other"
|
||||
|
||||
byte_stream.seek(original_position)
|
||||
return StreamInfo(magic_type=magic_type, category=category)
|
||||
return StreamInfo(magic_type=magic_type, category=category)
|
||||
|
|
|
|||
15
packages/markitup/src/markitup/_schemas.py
Normal file
15
packages/markitup/src/markitup/_schemas.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
from dataclasses import dataclass, asdict, field
|
||||
from typing import Optional, List, Literal
|
||||
|
||||
|
||||
@dataclass
|
||||
class StreamInfo:
|
||||
magic_type: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Config:
|
||||
modality: List[Literal["image", "audio"]] = field(
|
||||
default_factory=lambda: ["image", "audio"]
|
||||
)
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
from dataclasses import dataclass, asdict
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
|
||||
class StreamInfo:
|
||||
magic_type: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import os
|
||||
from io import BytesIO
|
||||
from markitup._stream_info import StreamInfo
|
||||
from markitup._schemas import StreamInfo
|
||||
import magic
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
|
|||
|
||||
from ._exiftool import exiftool_metadata
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._schemas import StreamInfo
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
|
|
@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter):
|
|||
# Transcribe
|
||||
if audio_format:
|
||||
try:
|
||||
transcript = transcribe_audio(file_stream, audio_format=audio_format)
|
||||
transcript = transcribe_audio(
|
||||
file_stream, audio_format=audio_format)
|
||||
if transcript:
|
||||
md_content += "\n\n### Audio Transcript:\n" + transcript
|
||||
except MissingDependencyException:
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from typing import BinaryIO, Any
|
|||
from charset_normalizer import from_bytes
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._schemas import StreamInfo
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"text/csv",
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from typing import BinaryIO, Any
|
|||
from ._html_converter import HtmlConverter
|
||||
from ..converter_utils.docx.pre_process import pre_process_docx
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._schemas import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
|
|
@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter):
|
|||
style_map = kwargs.get("style_map", None)
|
||||
pre_process_stream = pre_process_docx(file_stream)
|
||||
return self._html_converter.convert_string(
|
||||
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
|
||||
mammoth.convert_to_html(
|
||||
pre_process_stream, style_map=style_map).value,
|
||||
**kwargs,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
|
|||
from bs4 import BeautifulSoup
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._schemas import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
ACCEPTED_MAGIC_TYPE_PREFIXES = [
|
||||
|
|
@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [
|
|||
|
||||
class HtmlConverter(DocumentConverter):
|
||||
"""Anything with content type text/html"""
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter):
|
|||
) -> DocumentConverterResult:
|
||||
# Parse the stream
|
||||
encoding = "utf-8"
|
||||
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||
soup = BeautifulSoup(file_stream, "html.parser",
|
||||
from_encoding=encoding)
|
||||
|
||||
# Remove javascript and style blocks
|
||||
for script in soup(["script", "style"]):
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import io
|
|||
import base64
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._schemas import StreamInfo
|
||||
|
||||
import fitz
|
||||
|
||||
|
|
@ -21,42 +21,43 @@ class PdfConverter(DocumentConverter):
|
|||
) -> DocumentConverterResult:
|
||||
# Create a document object from the stream
|
||||
doc = fitz.open(stream=file_stream, filetype="pdf")
|
||||
|
||||
|
||||
# Extract text and images from all pages
|
||||
markdown_content = ""
|
||||
image_count = 0
|
||||
for page_num in range(len(doc)):
|
||||
page = doc.load_page(page_num)
|
||||
|
||||
|
||||
# Get text with the default "text" mode which gives plain text
|
||||
page_text = page.get_text("text")
|
||||
# Add page marker
|
||||
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
|
||||
markdown_content += page_text + "\n\n"
|
||||
|
||||
|
||||
# Extract images from the page
|
||||
image_list = page.get_images(full=True)
|
||||
|
||||
|
||||
for img_index, img_info in enumerate(image_list):
|
||||
xref = img_info[0] # Get the image reference
|
||||
base_image = doc.extract_image(xref)
|
||||
|
||||
|
||||
if base_image:
|
||||
image_bytes = base_image["image"]
|
||||
image_ext = base_image["ext"]
|
||||
|
||||
|
||||
try:
|
||||
# Convert image to base64 for markdown embedding
|
||||
img_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
||||
img_base64 = base64.b64encode(
|
||||
image_bytes).decode('utf-8')
|
||||
# Add image to markdown with a unique identifier
|
||||
image_count += 1
|
||||
markdown_content += f"\n\n"
|
||||
except Exception as e:
|
||||
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
||||
|
||||
|
||||
# Close the document to free resources
|
||||
doc.close()
|
||||
print(markdown_content)
|
||||
return DocumentConverterResult(
|
||||
markdown=markdown_content,
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,11 +1,12 @@
|
|||
from typing import BinaryIO, Any
|
||||
from charset_normalizer import from_bytes
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._schemas import StreamInfo
|
||||
|
||||
|
||||
class PlainTextConverter(DocumentConverter):
|
||||
"""Anything with content type text/plain"""
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from operator import attrgetter
|
|||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._schemas import StreamInfo
|
||||
import pptx
|
||||
|
||||
|
||||
|
|
@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
|
|||
|
||||
# Also grab any description embedded in the deck
|
||||
try:
|
||||
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
|
||||
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
|
||||
"descr", "")
|
||||
except Exception:
|
||||
# Unable to get alt text
|
||||
pass
|
||||
|
|
@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter):
|
|||
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||
md_content += f"\n\n"
|
||||
|
||||
|
||||
# Tables
|
||||
if self._is_table(shape):
|
||||
md_content += self._convert_table_to_markdown(shape.table, **kwargs)
|
||||
md_content += self._convert_table_to_markdown(
|
||||
shape.table, **kwargs)
|
||||
|
||||
# Charts
|
||||
if shape.has_chart:
|
||||
|
|
@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter):
|
|||
|
||||
# Group Shapes
|
||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
||||
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
|
||||
sorted_shapes = sorted(
|
||||
shape.shapes, key=attrgetter("top", "left"))
|
||||
for subshape in sorted_shapes:
|
||||
get_shape_content(subshape, **kwargs)
|
||||
|
||||
|
|
@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter):
|
|||
html_table += "</table></body></html>"
|
||||
|
||||
return (
|
||||
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
|
||||
self._html_converter.convert_string(
|
||||
html_table, **kwargs).markdown.strip()
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from typing import BinaryIO, Any
|
|||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
from .._stream_info import StreamInfo
|
||||
from .._schemas import StreamInfo
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
|
|
|
|||
Loading…
Reference in a new issue