This commit is contained in:
rong-xyz 2025-04-22 07:00:30 +00:00
parent e729da2b38
commit 03f3fa9829
14 changed files with 66 additions and 47 deletions

View file

@ -7,7 +7,7 @@ from ._markitup import (
MarkItUp,
)
from ._base_converter import DocumentConverterResult, DocumentConverter
from ._stream_info import StreamInfo
from ._schemas import StreamInfo, Config
from ._exceptions import (
MarkItUpException,
MissingDependencyException,
@ -27,4 +27,5 @@ __all__ = [
"FileConversionException",
"UnsupportedFormatException",
"StreamInfo",
"Config"
]

View file

@ -2,7 +2,7 @@ import os
import tempfile
from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List, Dict
from ._stream_info import StreamInfo
from ._schemas import StreamInfo
import re
@ -27,19 +27,18 @@ class DocumentConverterResult:
"""
self.markdown = markdown
self.title = title
def to_llm(self) -> List[Dict[str, Any]]:
"""
Convert markdown with base64 images to a format compatible with OpenAI's API.
This function parses the markdown content, extracting text and images in their
original order, and returns a list of content elements in OpenAI's format.
Returns:
List[Dict[str, Any]]: A list of dictionaries representing the content elements
(text and images) in their original order.
"""
# Pattern to match markdown image syntax with base64 data
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'

View file

@ -4,7 +4,7 @@ from urllib.parse import urlparse
from warnings import warn
import magic
from ._stream_info import StreamInfo
from ._schemas import StreamInfo, Config
from .converters import (
PlainTextConverter,
@ -33,7 +33,7 @@ class MarkItUp:
def __init__(
    self,
    config: Optional[Config] = None,
):
    """Store the configuration used by this instance.

    Args:
        config: Settings object to keep on ``self.config``. When omitted
            or ``None``, a new ``Config()`` is created for this instance.
    """
    # A ``Config()`` default written in the signature is evaluated once,
    # when the function is defined, so every instance built without an
    # argument would share that single object and its mutable
    # ``modality`` list. Build the default per call instead.
    self.config = Config() if config is None else config
@ -42,10 +42,12 @@ class MarkItUp:
# Deal with unsupported file types
match stream_info.category:
case "ppt":
raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
raise UnsupportedFormatException(
".ppt files are not supported, try .pptx instead")
case "other":
raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
raise UnsupportedFormatException(
f"{stream_info.magic_type} files are not supported")
try:
match stream_info.category:
case "text":
@ -55,7 +57,8 @@ class MarkItUp:
case "pdf":
return PdfConverter().convert(stream, stream_info), stream_info
except FailedConversionAttempt:
raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
raise FileConversionException(
f"Failed to convert file of type {stream_info.magic_type}")
return stream_info
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
@ -97,4 +100,4 @@ class MarkItUp:
category = "other"
byte_stream.seek(original_position)
return StreamInfo(magic_type=magic_type, category=category)
return StreamInfo(magic_type=magic_type, category=category)

View file

@ -0,0 +1,15 @@
from dataclasses import dataclass, asdict, field
from typing import Optional, List, Literal
@dataclass
class StreamInfo:
    """Type information attached to an input stream.

    Both attributes are optional and default to ``None``.
    """

    # Detected type string for the stream.
    # NOTE(review): presumably the value reported by `magic` -- confirm.
    magic_type: Optional[str] = field(default=None)
    # Coarse category label for the stream (callers compare it against
    # strings such as "text", "pdf", "ppt" and "other").
    category: Optional[str] = field(default=None)
def _default_modality() -> List[Literal["image", "audio"]]:
    """Return a new list holding the default modalities."""
    return ["image", "audio"]


@dataclass
class Config:
    """Settings object with a single ``modality`` list.

    NOTE(review): ``modality`` presumably selects which non-text content
    kinds are handled during conversion -- confirm against the converters.
    """

    # Defaults to both "image" and "audio"; the factory hands each
    # instance its own list rather than a shared one.
    modality: List[Literal["image", "audio"]] = field(
        default_factory=_default_modality
    )

View file

@ -1,8 +0,0 @@
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class StreamInfo:
magic_type: Optional[str] = None
category: Optional[str] = None

View file

@ -1,6 +1,6 @@
import os
from io import BytesIO
from markitup._stream_info import StreamInfo
from markitup._schemas import StreamInfo
import magic

View file

@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
from ._exiftool import exiftool_metadata
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._schemas import StreamInfo
from .._exceptions import MissingDependencyException
ACCEPTED_MIME_TYPE_PREFIXES = [
@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter):
# Transcribe
if audio_format:
try:
transcript = transcribe_audio(file_stream, audio_format=audio_format)
transcript = transcribe_audio(
file_stream, audio_format=audio_format)
if transcript:
md_content += "\n\n### Audio Transcript:\n" + transcript
except MissingDependencyException:

View file

@ -5,7 +5,7 @@ from typing import BinaryIO, Any
from charset_normalizer import from_bytes
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._schemas import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/csv",

View file

@ -5,7 +5,7 @@ from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._schemas import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
mammoth.convert_to_html(
pre_process_stream, style_map=style_map).value,
**kwargs,
)

View file

@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._schemas import StreamInfo
from ._markdownify import _CustomMarkdownify
ACCEPTED_MAGIC_TYPE_PREFIXES = [
@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""
def convert(
self,
file_stream: BinaryIO,
@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter):
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8"
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
soup = BeautifulSoup(file_stream, "html.parser",
from_encoding=encoding)
# Remove javascript and style blocks
for script in soup(["script", "style"]):

View file

@ -3,7 +3,7 @@ import io
import base64
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._schemas import StreamInfo
import fitz
@ -21,42 +21,43 @@ class PdfConverter(DocumentConverter):
) -> DocumentConverterResult:
# Create a document object from the stream
doc = fitz.open(stream=file_stream, filetype="pdf")
# Extract text and images from all pages
markdown_content = ""
image_count = 0
for page_num in range(len(doc)):
page = doc.load_page(page_num)
# Get text with the default "text" mode which gives plain text
page_text = page.get_text("text")
# Add page marker
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
markdown_content += page_text + "\n\n"
# Extract images from the page
image_list = page.get_images(full=True)
for img_index, img_info in enumerate(image_list):
xref = img_info[0] # Get the image reference
base_image = doc.extract_image(xref)
if base_image:
image_bytes = base_image["image"]
image_ext = base_image["ext"]
try:
# Convert image to base64 for markdown embedding
img_base64 = base64.b64encode(image_bytes).decode('utf-8')
img_base64 = base64.b64encode(
image_bytes).decode('utf-8')
# Add image to markdown with a unique identifier
image_count += 1
markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
except Exception as e:
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
# Close the document to free resources
doc.close()
print(markdown_content)
return DocumentConverterResult(
markdown=markdown_content,
)
)

View file

@ -1,11 +1,12 @@
from typing import BinaryIO, Any
from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._schemas import StreamInfo
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
def convert(
self,
file_stream: BinaryIO,

View file

@ -10,7 +10,7 @@ from operator import attrgetter
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._schemas import StreamInfo
import pptx
@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
# Also grab any description embedded in the deck
try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
"descr", "")
except Exception:
# Unable to get alt text
pass
@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter):
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
# Tables
if self._is_table(shape):
md_content += self._convert_table_to_markdown(shape.table, **kwargs)
md_content += self._convert_table_to_markdown(
shape.table, **kwargs)
# Charts
if shape.has_chart:
@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter):
# Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
sorted_shapes = sorted(
shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs)
@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter):
html_table += "</table></body></html>"
return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
self._html_converter.convert_string(
html_table, **kwargs).markdown.strip()
+ "\n"
)

View file

@ -3,7 +3,7 @@ from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._stream_info import StreamInfo
from .._schemas import StreamInfo
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later