From 03f3fa98290f6c67828de85cc642a01fcf572b21 Mon Sep 17 00:00:00 2001 From: rong-xyz Date: Tue, 22 Apr 2025 07:00:30 +0000 Subject: [PATCH] modality --- packages/markitup/src/markitup/__init__.py | 3 ++- .../markitup/src/markitup/_base_converter.py | 9 ++++---- packages/markitup/src/markitup/_markitup.py | 17 ++++++++------- packages/markitup/src/markitup/_schemas.py | 15 +++++++++++++ .../markitup/src/markitup/_stream_info.py | 8 ------- .../src/markitup/converter_utils/utils.py | 2 +- .../markitup/converters/_audio_converter.py | 5 +++-- .../src/markitup/converters/_csv_converter.py | 2 +- .../markitup/converters/_docx_converter.py | 5 +++-- .../markitup/converters/_html_converter.py | 6 ++++-- .../src/markitup/converters/_pdf_converter.py | 21 ++++++++++--------- .../converters/_plain_text_converter.py | 3 ++- .../markitup/converters/_pptx_converter.py | 15 +++++++------ .../markitup/converters/_xlsx_converter.py | 2 +- 14 files changed, 66 insertions(+), 47 deletions(-) create mode 100644 packages/markitup/src/markitup/_schemas.py delete mode 100644 packages/markitup/src/markitup/_stream_info.py diff --git a/packages/markitup/src/markitup/__init__.py b/packages/markitup/src/markitup/__init__.py index aef329a..ba22cbc 100644 --- a/packages/markitup/src/markitup/__init__.py +++ b/packages/markitup/src/markitup/__init__.py @@ -7,7 +7,7 @@ from ._markitup import ( MarkItUp, ) from ._base_converter import DocumentConverterResult, DocumentConverter -from ._stream_info import StreamInfo +from ._schemas import StreamInfo, Config from ._exceptions import ( MarkItUpException, MissingDependencyException, @@ -27,4 +27,5 @@ __all__ = [ "FileConversionException", "UnsupportedFormatException", "StreamInfo", + "Config" ] diff --git a/packages/markitup/src/markitup/_base_converter.py b/packages/markitup/src/markitup/_base_converter.py index cdabef9..9de88df 100644 --- a/packages/markitup/src/markitup/_base_converter.py +++ b/packages/markitup/src/markitup/_base_converter.py @@ -2,7 +2,7 @@ import os import tempfile from warnings import warn from typing import Any, Union, BinaryIO, Optional, List, Dict -from ._stream_info import StreamInfo +from ._schemas import StreamInfo import re @@ -27,19 +27,18 @@ class DocumentConverterResult: """ self.markdown = markdown self.title = title - + def to_llm(self) -> List[Dict[str, Any]]: """ Convert markdown with base64 images to a format compatible with OpenAI's API. - + This function parses the markdown content, extracting text and images in their original order, and returns a list of content elements in OpenAI's format. - + Returns: List[Dict[str, Any]]: A list of dictionaries representing the content elements (text and images) in their original order. """ - # Pattern to match markdown image syntax with base64 data pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)' diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py index c2fb0a2..0b5ddf0 100644 --- a/packages/markitup/src/markitup/_markitup.py +++ b/packages/markitup/src/markitup/_markitup.py @@ -4,7 +4,7 @@ from urllib.parse import urlparse from warnings import warn import magic -from ._stream_info import StreamInfo +from ._schemas import StreamInfo, Config from .converters import ( PlainTextConverter, @@ -33,7 +33,7 @@ class MarkItUp: def __init__( self, - config: Optional[Dict[str, Any]] = None, + config: Config = Config(), ): self.config = config @@ -42,10 +42,12 @@ class MarkItUp: # Deal with unsupported file types match stream_info.category: case "ppt": - raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead") + raise UnsupportedFormatException( + ".ppt files are not supported, try .pptx instead") case "other": - raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported") - + raise UnsupportedFormatException( + f"{stream_info.magic_type} files are not supported") + try: match stream_info.category: case "text": @@ -55,7 +57,8 @@ class MarkItUp: case "pdf": return PdfConverter().convert(stream, stream_info), stream_info except FailedConversionAttempt: - raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}") + raise FileConversionException( + f"Failed to convert file of type {stream_info.magic_type}") return stream_info def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo: @@ -97,4 +100,4 @@ class MarkItUp: category = "other" byte_stream.seek(original_position) - return StreamInfo(magic_type=magic_type, category=category) \ No newline at end of file + return StreamInfo(magic_type=magic_type, category=category) diff --git a/packages/markitup/src/markitup/_schemas.py b/packages/markitup/src/markitup/_schemas.py new file mode 100644 index 0000000..ecfce92 --- /dev/null +++ b/packages/markitup/src/markitup/_schemas.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass, asdict, field +from typing import Optional, List, Literal + + +@dataclass +class StreamInfo: + magic_type: Optional[str] = None + category: Optional[str] = None + + +@dataclass +class Config: + modality: List[Literal["image", "audio"]] = field( + default_factory=lambda: ["image", "audio"] + ) diff --git a/packages/markitup/src/markitup/_stream_info.py b/packages/markitup/src/markitup/_stream_info.py deleted file mode 100644 index 66e8c72..0000000 --- a/packages/markitup/src/markitup/_stream_info.py +++ /dev/null @@ -1,8 +0,0 @@ -from dataclasses import dataclass, asdict -from typing import Optional - - -@dataclass -class StreamInfo: - magic_type: Optional[str] = None - category: Optional[str] = None \ No newline at end of file diff --git a/packages/markitup/src/markitup/converter_utils/utils.py b/packages/markitup/src/markitup/converter_utils/utils.py index 8d5df3d..12e533d 100644 --- a/packages/markitup/src/markitup/converter_utils/utils.py +++ b/packages/markitup/src/markitup/converter_utils/utils.py @@ -1,6 +1,6 @@ import os from io import BytesIO -from markitup._stream_info import StreamInfo +from markitup._schemas import StreamInfo import magic diff --git a/packages/markitup/src/markitup/converters/_audio_converter.py b/packages/markitup/src/markitup/converters/_audio_converter.py index 57a828d..eeff58e 100644 --- a/packages/markitup/src/markitup/converters/_audio_converter.py +++ b/packages/markitup/src/markitup/converters/_audio_converter.py @@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional from ._exiftool import exiftool_metadata from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo from .._exceptions import MissingDependencyException ACCEPTED_MIME_TYPE_PREFIXES = [ @@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter): # Transcribe if audio_format: try: - transcript = transcribe_audio(file_stream, audio_format=audio_format) + transcript = transcribe_audio( + file_stream, audio_format=audio_format) if transcript: md_content += "\n\n### Audio Transcript:\n" + transcript except MissingDependencyException: diff --git a/packages/markitup/src/markitup/converters/_csv_converter.py b/packages/markitup/src/markitup/converters/_csv_converter.py index 7162889..78963ed 100644 --- a/packages/markitup/src/markitup/converters/_csv_converter.py +++ b/packages/markitup/src/markitup/converters/_csv_converter.py @@ -5,7 +5,7 @@ from typing import BinaryIO, Any from charset_normalizer import from_bytes from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo ACCEPTED_MIME_TYPE_PREFIXES = [ "text/csv", diff --git a/packages/markitup/src/markitup/converters/_docx_converter.py b/packages/markitup/src/markitup/converters/_docx_converter.py index b320695..0db97a9 100644 --- a/packages/markitup/src/markitup/converters/_docx_converter.py +++ b/packages/markitup/src/markitup/converters/_docx_converter.py @@ -5,7 +5,7 @@ from typing import BinaryIO, Any from ._html_converter import HtmlConverter from ..converter_utils.docx.pre_process import pre_process_docx from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies @@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter): style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( - mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, + mammoth.convert_to_html( + pre_process_stream, style_map=style_map).value, **kwargs, ) diff --git a/packages/markitup/src/markitup/converters/_html_converter.py b/packages/markitup/src/markitup/converters/_html_converter.py index b85a68d..91db39a 100644 --- a/packages/markitup/src/markitup/converters/_html_converter.py +++ b/packages/markitup/src/markitup/converters/_html_converter.py @@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo from ._markdownify import _CustomMarkdownify ACCEPTED_MAGIC_TYPE_PREFIXES = [ @@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [ class HtmlConverter(DocumentConverter): """Anything with content type text/html""" + def convert( self, file_stream: BinaryIO, @@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter): ) -> DocumentConverterResult: # Parse the stream encoding = "utf-8" - soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) + soup = BeautifulSoup(file_stream, "html.parser", + from_encoding=encoding) # Remove javascript and style blocks for script in soup(["script", "style"]): diff --git a/packages/markitup/src/markitup/converters/_pdf_converter.py b/packages/markitup/src/markitup/converters/_pdf_converter.py index 3f64f42..0794c8a 100644 --- a/packages/markitup/src/markitup/converters/_pdf_converter.py +++ b/packages/markitup/src/markitup/converters/_pdf_converter.py @@ -3,7 +3,7 @@ import io import base64 from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo import fitz @@ -21,42 +21,43 @@ class PdfConverter(DocumentConverter): ) -> DocumentConverterResult: # Create a document object from the stream doc = fitz.open(stream=file_stream, filetype="pdf") - + # Extract text and images from all pages markdown_content = "" image_count = 0 for page_num in range(len(doc)): page = doc.load_page(page_num) - + # Get text with the default "text" mode which gives plain text page_text = page.get_text("text") # Add page marker markdown_content += f"\n\n## Page {page_num + 1}\n\n" markdown_content += page_text + "\n\n" - + # Extract images from the page image_list = page.get_images(full=True) - + for img_index, img_info in enumerate(image_list): xref = img_info[0] # Get the image reference base_image = doc.extract_image(xref) - + if base_image: image_bytes = base_image["image"] image_ext = base_image["ext"] - + try: # Convert image to base64 for markdown embedding - img_base64 = base64.b64encode(image_bytes).decode('utf-8') + img_base64 = base64.b64encode( + image_bytes).decode('utf-8') # Add image to markdown with a unique identifier image_count += 1 markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" except Exception as e: markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n" - + # Close the document to free resources doc.close() print(markdown_content) return DocumentConverterResult( markdown=markdown_content, - ) \ No newline at end of file + ) diff --git a/packages/markitup/src/markitup/converters/_plain_text_converter.py b/packages/markitup/src/markitup/converters/_plain_text_converter.py index b7f776e..740a4f7 100644 --- a/packages/markitup/src/markitup/converters/_plain_text_converter.py +++ b/packages/markitup/src/markitup/converters/_plain_text_converter.py @@ -1,11 +1,12 @@ from typing import BinaryIO, Any from charset_normalizer import from_bytes from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" + def convert( self, file_stream: BinaryIO, diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py index f1c112b..3ee4595 100644 --- a/packages/markitup/src/markitup/converters/_pptx_converter.py +++ b/packages/markitup/src/markitup/converters/_pptx_converter.py @@ -10,7 +10,7 @@ from operator import attrgetter from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo import pptx @@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter): # Also grab any description embedded in the deck try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") + alt_text = shape._element._nvXxPr.cNvPr.attrib.get( + "descr", "") except Exception: # Unable to get alt text pass @@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter): b64_string = base64.b64encode(blob).decode("utf-8") md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" - # Tables if self._is_table(shape): - md_content += self._convert_table_to_markdown(shape.table, **kwargs) + md_content += self._convert_table_to_markdown( + shape.table, **kwargs) # Charts if shape.has_chart: @@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter): # Group Shapes if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: - sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left")) + sorted_shapes = sorted( + shape.shapes, key=attrgetter("top", "left")) for subshape in sorted_shapes: get_shape_content(subshape, **kwargs) @@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter): html_table += "" return ( - self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + self._html_converter.convert_string( + html_table, **kwargs).markdown.strip() + "\n" ) diff --git a/packages/markitup/src/markitup/converters/_xlsx_converter.py b/packages/markitup/src/markitup/converters/_xlsx_converter.py index 28f73a0..8769fe0 100644 --- a/packages/markitup/src/markitup/converters/_xlsx_converter.py +++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py @@ -3,7 +3,7 @@ from typing import BinaryIO, Any from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE -from .._stream_info import StreamInfo +from .._schemas import StreamInfo # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later