modality
This commit is contained in:
parent
e729da2b38
commit
03f3fa9829
14 changed files with 66 additions and 47 deletions
|
|
@ -7,7 +7,7 @@ from ._markitup import (
|
||||||
MarkItUp,
|
MarkItUp,
|
||||||
)
|
)
|
||||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||||
from ._stream_info import StreamInfo
|
from ._schemas import StreamInfo, Config
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
MarkItUpException,
|
MarkItUpException,
|
||||||
MissingDependencyException,
|
MissingDependencyException,
|
||||||
|
|
@ -27,4 +27,5 @@ __all__ = [
|
||||||
"FileConversionException",
|
"FileConversionException",
|
||||||
"UnsupportedFormatException",
|
"UnsupportedFormatException",
|
||||||
"StreamInfo",
|
"StreamInfo",
|
||||||
|
"Config"
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
||||||
from ._stream_info import StreamInfo
|
from ._schemas import StreamInfo
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -27,19 +27,18 @@ class DocumentConverterResult:
|
||||||
"""
|
"""
|
||||||
self.markdown = markdown
|
self.markdown = markdown
|
||||||
self.title = title
|
self.title = title
|
||||||
|
|
||||||
def to_llm(self) -> List[Dict[str, Any]]:
|
def to_llm(self) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Convert markdown with base64 images to a format compatible with OpenAI's API.
|
Convert markdown with base64 images to a format compatible with OpenAI's API.
|
||||||
|
|
||||||
This function parses the markdown content, extracting text and images in their
|
This function parses the markdown content, extracting text and images in their
|
||||||
original order, and returns a list of content elements in OpenAI's format.
|
original order, and returns a list of content elements in OpenAI's format.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[Dict[str, Any]]: A list of dictionaries representing the content elements
|
List[Dict[str, Any]]: A list of dictionaries representing the content elements
|
||||||
(text and images) in their original order.
|
(text and images) in their original order.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
# Pattern to match markdown image syntax with base64 data
|
# Pattern to match markdown image syntax with base64 data
|
||||||
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
|
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ from urllib.parse import urlparse
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
import magic
|
import magic
|
||||||
|
|
||||||
from ._stream_info import StreamInfo
|
from ._schemas import StreamInfo, Config
|
||||||
|
|
||||||
from .converters import (
|
from .converters import (
|
||||||
PlainTextConverter,
|
PlainTextConverter,
|
||||||
|
|
@ -33,7 +33,7 @@ class MarkItUp:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
config: Optional[Dict[str, Any]] = None,
|
config: Config = Config(),
|
||||||
):
|
):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
|
|
@ -42,10 +42,12 @@ class MarkItUp:
|
||||||
# Deal with unsupported file types
|
# Deal with unsupported file types
|
||||||
match stream_info.category:
|
match stream_info.category:
|
||||||
case "ppt":
|
case "ppt":
|
||||||
raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
|
raise UnsupportedFormatException(
|
||||||
|
".ppt files are not supported, try .pptx instead")
|
||||||
case "other":
|
case "other":
|
||||||
raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
|
raise UnsupportedFormatException(
|
||||||
|
f"{stream_info.magic_type} files are not supported")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
match stream_info.category:
|
match stream_info.category:
|
||||||
case "text":
|
case "text":
|
||||||
|
|
@ -55,7 +57,8 @@ class MarkItUp:
|
||||||
case "pdf":
|
case "pdf":
|
||||||
return PdfConverter().convert(stream, stream_info), stream_info
|
return PdfConverter().convert(stream, stream_info), stream_info
|
||||||
except FailedConversionAttempt:
|
except FailedConversionAttempt:
|
||||||
raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
|
raise FileConversionException(
|
||||||
|
f"Failed to convert file of type {stream_info.magic_type}")
|
||||||
return stream_info
|
return stream_info
|
||||||
|
|
||||||
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
|
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
|
||||||
|
|
@ -97,4 +100,4 @@ class MarkItUp:
|
||||||
category = "other"
|
category = "other"
|
||||||
|
|
||||||
byte_stream.seek(original_position)
|
byte_stream.seek(original_position)
|
||||||
return StreamInfo(magic_type=magic_type, category=category)
|
return StreamInfo(magic_type=magic_type, category=category)
|
||||||
|
|
|
||||||
15
packages/markitup/src/markitup/_schemas.py
Normal file
15
packages/markitup/src/markitup/_schemas.py
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
from dataclasses import dataclass, asdict, field
|
||||||
|
from typing import Optional, List, Literal
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class StreamInfo:
|
||||||
|
magic_type: Optional[str] = None
|
||||||
|
category: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Config:
|
||||||
|
modality: List[Literal["image", "audio"]] = field(
|
||||||
|
default_factory=lambda: ["image", "audio"]
|
||||||
|
)
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
from dataclasses import dataclass, asdict
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class StreamInfo:
|
|
||||||
magic_type: Optional[str] = None
|
|
||||||
category: Optional[str] = None
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import os
|
import os
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from markitup._stream_info import StreamInfo
|
from markitup._schemas import StreamInfo
|
||||||
import magic
|
import magic
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
|
||||||
|
|
||||||
from ._exiftool import exiftool_metadata
|
from ._exiftool import exiftool_metadata
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException
|
from .._exceptions import MissingDependencyException
|
||||||
|
|
||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
|
@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter):
|
||||||
# Transcribe
|
# Transcribe
|
||||||
if audio_format:
|
if audio_format:
|
||||||
try:
|
try:
|
||||||
transcript = transcribe_audio(file_stream, audio_format=audio_format)
|
transcript = transcribe_audio(
|
||||||
|
file_stream, audio_format=audio_format)
|
||||||
if transcript:
|
if transcript:
|
||||||
md_content += "\n\n### Audio Transcript:\n" + transcript
|
md_content += "\n\n### Audio Transcript:\n" + transcript
|
||||||
except MissingDependencyException:
|
except MissingDependencyException:
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from typing import BinaryIO, Any
|
||||||
from charset_normalizer import from_bytes
|
from charset_normalizer import from_bytes
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
|
|
||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
"text/csv",
|
"text/csv",
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from typing import BinaryIO, Any
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from ..converter_utils.docx.pre_process import pre_process_docx
|
from ..converter_utils.docx.pre_process import pre_process_docx
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter):
|
||||||
style_map = kwargs.get("style_map", None)
|
style_map = kwargs.get("style_map", None)
|
||||||
pre_process_stream = pre_process_docx(file_stream)
|
pre_process_stream = pre_process_docx(file_stream)
|
||||||
return self._html_converter.convert_string(
|
return self._html_converter.convert_string(
|
||||||
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
|
mammoth.convert_to_html(
|
||||||
|
pre_process_stream, style_map=style_map).value,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
ACCEPTED_MAGIC_TYPE_PREFIXES = [
|
ACCEPTED_MAGIC_TYPE_PREFIXES = [
|
||||||
|
|
@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [
|
||||||
|
|
||||||
class HtmlConverter(DocumentConverter):
|
class HtmlConverter(DocumentConverter):
|
||||||
"""Anything with content type text/html"""
|
"""Anything with content type text/html"""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter):
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Parse the stream
|
# Parse the stream
|
||||||
encoding = "utf-8"
|
encoding = "utf-8"
|
||||||
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
soup = BeautifulSoup(file_stream, "html.parser",
|
||||||
|
from_encoding=encoding)
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ import io
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
|
|
||||||
|
|
@ -21,42 +21,43 @@ class PdfConverter(DocumentConverter):
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Create a document object from the stream
|
# Create a document object from the stream
|
||||||
doc = fitz.open(stream=file_stream, filetype="pdf")
|
doc = fitz.open(stream=file_stream, filetype="pdf")
|
||||||
|
|
||||||
# Extract text and images from all pages
|
# Extract text and images from all pages
|
||||||
markdown_content = ""
|
markdown_content = ""
|
||||||
image_count = 0
|
image_count = 0
|
||||||
for page_num in range(len(doc)):
|
for page_num in range(len(doc)):
|
||||||
page = doc.load_page(page_num)
|
page = doc.load_page(page_num)
|
||||||
|
|
||||||
# Get text with the default "text" mode which gives plain text
|
# Get text with the default "text" mode which gives plain text
|
||||||
page_text = page.get_text("text")
|
page_text = page.get_text("text")
|
||||||
# Add page marker
|
# Add page marker
|
||||||
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
|
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
|
||||||
markdown_content += page_text + "\n\n"
|
markdown_content += page_text + "\n\n"
|
||||||
|
|
||||||
# Extract images from the page
|
# Extract images from the page
|
||||||
image_list = page.get_images(full=True)
|
image_list = page.get_images(full=True)
|
||||||
|
|
||||||
for img_index, img_info in enumerate(image_list):
|
for img_index, img_info in enumerate(image_list):
|
||||||
xref = img_info[0] # Get the image reference
|
xref = img_info[0] # Get the image reference
|
||||||
base_image = doc.extract_image(xref)
|
base_image = doc.extract_image(xref)
|
||||||
|
|
||||||
if base_image:
|
if base_image:
|
||||||
image_bytes = base_image["image"]
|
image_bytes = base_image["image"]
|
||||||
image_ext = base_image["ext"]
|
image_ext = base_image["ext"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Convert image to base64 for markdown embedding
|
# Convert image to base64 for markdown embedding
|
||||||
img_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
img_base64 = base64.b64encode(
|
||||||
|
image_bytes).decode('utf-8')
|
||||||
# Add image to markdown with a unique identifier
|
# Add image to markdown with a unique identifier
|
||||||
image_count += 1
|
image_count += 1
|
||||||
markdown_content += f"\n\n"
|
markdown_content += f"\n\n"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
||||||
|
|
||||||
# Close the document to free resources
|
# Close the document to free resources
|
||||||
doc.close()
|
doc.close()
|
||||||
print(markdown_content)
|
print(markdown_content)
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=markdown_content,
|
markdown=markdown_content,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,12 @@
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
from charset_normalizer import from_bytes
|
from charset_normalizer import from_bytes
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
|
|
||||||
|
|
||||||
class PlainTextConverter(DocumentConverter):
|
class PlainTextConverter(DocumentConverter):
|
||||||
"""Anything with content type text/plain"""
|
"""Anything with content type text/plain"""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from operator import attrgetter
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
import pptx
|
import pptx
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
|
||||||
|
|
||||||
# Also grab any description embedded in the deck
|
# Also grab any description embedded in the deck
|
||||||
try:
|
try:
|
||||||
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
|
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
|
||||||
|
"descr", "")
|
||||||
except Exception:
|
except Exception:
|
||||||
# Unable to get alt text
|
# Unable to get alt text
|
||||||
pass
|
pass
|
||||||
|
|
@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter):
|
||||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||||
md_content += f"\n\n"
|
md_content += f"\n\n"
|
||||||
|
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
if self._is_table(shape):
|
if self._is_table(shape):
|
||||||
md_content += self._convert_table_to_markdown(shape.table, **kwargs)
|
md_content += self._convert_table_to_markdown(
|
||||||
|
shape.table, **kwargs)
|
||||||
|
|
||||||
# Charts
|
# Charts
|
||||||
if shape.has_chart:
|
if shape.has_chart:
|
||||||
|
|
@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter):
|
||||||
|
|
||||||
# Group Shapes
|
# Group Shapes
|
||||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
||||||
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
|
sorted_shapes = sorted(
|
||||||
|
shape.shapes, key=attrgetter("top", "left"))
|
||||||
for subshape in sorted_shapes:
|
for subshape in sorted_shapes:
|
||||||
get_shape_content(subshape, **kwargs)
|
get_shape_content(subshape, **kwargs)
|
||||||
|
|
||||||
|
|
@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter):
|
||||||
html_table += "</table></body></html>"
|
html_table += "</table></body></html>"
|
||||||
|
|
||||||
return (
|
return (
|
||||||
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
|
self._html_converter.convert_string(
|
||||||
|
html_table, **kwargs).markdown.strip()
|
||||||
+ "\n"
|
+ "\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ from typing import BinaryIO, Any
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue