This commit is contained in:
rong-xyz 2025-04-22 07:00:30 +00:00
parent e729da2b38
commit 03f3fa9829
14 changed files with 66 additions and 47 deletions

View file

@ -7,7 +7,7 @@ from ._markitup import (
MarkItUp, MarkItUp,
) )
from ._base_converter import DocumentConverterResult, DocumentConverter from ._base_converter import DocumentConverterResult, DocumentConverter
from ._stream_info import StreamInfo from ._schemas import StreamInfo, Config
from ._exceptions import ( from ._exceptions import (
MarkItUpException, MarkItUpException,
MissingDependencyException, MissingDependencyException,
@ -27,4 +27,5 @@ __all__ = [
"FileConversionException", "FileConversionException",
"UnsupportedFormatException", "UnsupportedFormatException",
"StreamInfo", "StreamInfo",
"Config"
] ]

View file

@ -2,7 +2,7 @@ import os
import tempfile import tempfile
from warnings import warn from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List, Dict from typing import Any, Union, BinaryIO, Optional, List, Dict
from ._stream_info import StreamInfo from ._schemas import StreamInfo
import re import re
@ -27,19 +27,18 @@ class DocumentConverterResult:
""" """
self.markdown = markdown self.markdown = markdown
self.title = title self.title = title
def to_llm(self) -> List[Dict[str, Any]]: def to_llm(self) -> List[Dict[str, Any]]:
""" """
Convert markdown with base64 images to a format compatible with OpenAI's API. Convert markdown with base64 images to a format compatible with OpenAI's API.
This function parses the markdown content, extracting text and images in their This function parses the markdown content, extracting text and images in their
original order, and returns a list of content elements in OpenAI's format. original order, and returns a list of content elements in OpenAI's format.
Returns: Returns:
List[Dict[str, Any]]: A list of dictionaries representing the content elements List[Dict[str, Any]]: A list of dictionaries representing the content elements
(text and images) in their original order. (text and images) in their original order.
""" """
# Pattern to match markdown image syntax with base64 data # Pattern to match markdown image syntax with base64 data
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)' pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'

View file

@ -4,7 +4,7 @@ from urllib.parse import urlparse
from warnings import warn from warnings import warn
import magic import magic
from ._stream_info import StreamInfo from ._schemas import StreamInfo, Config
from .converters import ( from .converters import (
PlainTextConverter, PlainTextConverter,
@ -33,7 +33,7 @@ class MarkItUp:
def __init__( def __init__(
self, self,
config: Optional[Dict[str, Any]] = None, config: Config = Config(),
): ):
self.config = config self.config = config
@ -42,10 +42,12 @@ class MarkItUp:
# Deal with unsupported file types # Deal with unsupported file types
match stream_info.category: match stream_info.category:
case "ppt": case "ppt":
raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead") raise UnsupportedFormatException(
".ppt files are not supported, try .pptx instead")
case "other": case "other":
raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported") raise UnsupportedFormatException(
f"{stream_info.magic_type} files are not supported")
try: try:
match stream_info.category: match stream_info.category:
case "text": case "text":
@ -55,7 +57,8 @@ class MarkItUp:
case "pdf": case "pdf":
return PdfConverter().convert(stream, stream_info), stream_info return PdfConverter().convert(stream, stream_info), stream_info
except FailedConversionAttempt: except FailedConversionAttempt:
raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}") raise FileConversionException(
f"Failed to convert file of type {stream_info.magic_type}")
return stream_info return stream_info
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo: def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
@ -97,4 +100,4 @@ class MarkItUp:
category = "other" category = "other"
byte_stream.seek(original_position) byte_stream.seek(original_position)
return StreamInfo(magic_type=magic_type, category=category) return StreamInfo(magic_type=magic_type, category=category)

View file

@ -0,0 +1,15 @@
from dataclasses import dataclass, asdict, field
from typing import Optional, List, Literal
# Describes a detected input stream as two optional strings; both default
# to None, so StreamInfo() is a valid "nothing known yet" value.
@dataclass
class StreamInfo:
# Detected type string for the stream. Elsewhere in this diff it is
# interpolated into the "... files are not supported" and
# "Failed to convert file of type ..." messages.
# NOTE(review): presumably produced by the `magic` library - confirm in _get_stream_info.
magic_type: Optional[str] = None
# Coarse bucket used to pick a converter. Values matched elsewhere in this
# diff include "text", "pdf", "ppt" and "other".
category: Optional[str] = None
# Options object for the converter entry point; per this diff an instance is
# accepted by MarkItUp.__init__ and stored unchanged as self.config.
@dataclass
class Config:
# List of modality names; each entry is typed as "image" or "audio".
# default_factory builds a fresh ["image", "audio"] list for every instance,
# so instances never share one default list.
# NOTE(review): how converters consume this list is not visible here - confirm
# whether it selects modalities to keep or to process.
modality: List[Literal["image", "audio"]] = field(
default_factory=lambda: ["image", "audio"]
)

View file

@ -1,8 +0,0 @@
from dataclasses import dataclass, asdict
from typing import Optional
# Describes a detected input stream as two optional strings; both default
# to None, so StreamInfo() is a valid "nothing known yet" value.
@dataclass
class StreamInfo:
# Detected type string for the stream.
# NOTE(review): presumably produced by the `magic` library - confirm against the caller.
magic_type: Optional[str] = None
# Coarse bucket used to pick a converter (e.g. "text", "pdf", "ppt", "other"
# are the values matched elsewhere in this diff).
category: Optional[str] = None

View file

@ -1,6 +1,6 @@
import os import os
from io import BytesIO from io import BytesIO
from markitup._stream_info import StreamInfo from markitup._schemas import StreamInfo
import magic import magic

View file

@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
from ._exiftool import exiftool_metadata from ._exiftool import exiftool_metadata
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
from .._exceptions import MissingDependencyException from .._exceptions import MissingDependencyException
ACCEPTED_MIME_TYPE_PREFIXES = [ ACCEPTED_MIME_TYPE_PREFIXES = [
@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter):
# Transcribe # Transcribe
if audio_format: if audio_format:
try: try:
transcript = transcribe_audio(file_stream, audio_format=audio_format) transcript = transcribe_audio(
file_stream, audio_format=audio_format)
if transcript: if transcript:
md_content += "\n\n### Audio Transcript:\n" + transcript md_content += "\n\n### Audio Transcript:\n" + transcript
except MissingDependencyException: except MissingDependencyException:

View file

@ -5,7 +5,7 @@ from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [ ACCEPTED_MIME_TYPE_PREFIXES = [
"text/csv", "text/csv",

View file

@ -5,7 +5,7 @@ from typing import BinaryIO, Any
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None) style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream) pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string( return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, mammoth.convert_to_html(
pre_process_stream, style_map=style_map).value,
**kwargs, **kwargs,
) )

View file

@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
ACCEPTED_MAGIC_TYPE_PREFIXES = [ ACCEPTED_MAGIC_TYPE_PREFIXES = [
@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
"""Anything with content type text/html""" """Anything with content type text/html"""
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter):
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Parse the stream # Parse the stream
encoding = "utf-8" encoding = "utf-8"
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) soup = BeautifulSoup(file_stream, "html.parser",
from_encoding=encoding)
# Remove javascript and style blocks # Remove javascript and style blocks
for script in soup(["script", "style"]): for script in soup(["script", "style"]):

View file

@ -3,7 +3,7 @@ import io
import base64 import base64
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
import fitz import fitz
@ -21,42 +21,43 @@ class PdfConverter(DocumentConverter):
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Create a document object from the stream # Create a document object from the stream
doc = fitz.open(stream=file_stream, filetype="pdf") doc = fitz.open(stream=file_stream, filetype="pdf")
# Extract text and images from all pages # Extract text and images from all pages
markdown_content = "" markdown_content = ""
image_count = 0 image_count = 0
for page_num in range(len(doc)): for page_num in range(len(doc)):
page = doc.load_page(page_num) page = doc.load_page(page_num)
# Get text with the default "text" mode which gives plain text # Get text with the default "text" mode which gives plain text
page_text = page.get_text("text") page_text = page.get_text("text")
# Add page marker # Add page marker
markdown_content += f"\n\n## Page {page_num + 1}\n\n" markdown_content += f"\n\n## Page {page_num + 1}\n\n"
markdown_content += page_text + "\n\n" markdown_content += page_text + "\n\n"
# Extract images from the page # Extract images from the page
image_list = page.get_images(full=True) image_list = page.get_images(full=True)
for img_index, img_info in enumerate(image_list): for img_index, img_info in enumerate(image_list):
xref = img_info[0] # Get the image reference xref = img_info[0] # Get the image reference
base_image = doc.extract_image(xref) base_image = doc.extract_image(xref)
if base_image: if base_image:
image_bytes = base_image["image"] image_bytes = base_image["image"]
image_ext = base_image["ext"] image_ext = base_image["ext"]
try: try:
# Convert image to base64 for markdown embedding # Convert image to base64 for markdown embedding
img_base64 = base64.b64encode(image_bytes).decode('utf-8') img_base64 = base64.b64encode(
image_bytes).decode('utf-8')
# Add image to markdown with a unique identifier # Add image to markdown with a unique identifier
image_count += 1 image_count += 1
markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
except Exception as e: except Exception as e:
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n" markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
# Close the document to free resources # Close the document to free resources
doc.close() doc.close()
print(markdown_content) print(markdown_content)
return DocumentConverterResult( return DocumentConverterResult(
markdown=markdown_content, markdown=markdown_content,
) )

View file

@ -1,11 +1,12 @@
from typing import BinaryIO, Any from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
class PlainTextConverter(DocumentConverter): class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain""" """Anything with content type text/plain"""
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,

View file

@ -10,7 +10,7 @@ from operator import attrgetter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
import pptx import pptx
@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
# Also grab any description embedded in the deck # Also grab any description embedded in the deck
try: try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
"descr", "")
except Exception: except Exception:
# Unable to get alt text # Unable to get alt text
pass pass
@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter):
b64_string = base64.b64encode(blob).decode("utf-8") b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
# Tables # Tables
if self._is_table(shape): if self._is_table(shape):
md_content += self._convert_table_to_markdown(shape.table, **kwargs) md_content += self._convert_table_to_markdown(
shape.table, **kwargs)
# Charts # Charts
if shape.has_chart: if shape.has_chart:
@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter):
# Group Shapes # Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left")) sorted_shapes = sorted(
shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes: for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs) get_shape_content(subshape, **kwargs)
@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter):
html_table += "</table></body></html>" html_table += "</table></body></html>"
return ( return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip() self._html_converter.convert_string(
html_table, **kwargs).markdown.strip()
+ "\n" + "\n"
) )

View file

@ -3,7 +3,7 @@ from typing import BinaryIO, Any
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._stream_info import StreamInfo from .._schemas import StreamInfo
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later # Save reporting of any exceptions for later