modality

2025-04-22 07:00:30 +00:00 · 2025-04-22 07:00:30 +00:00 · 03f3fa9829
commit 03f3fa9829
parent e729da2b38
14 changed files with 66 additions and 47 deletions
--- a/packages/markitup/src/markitup/__init__.py
+++ b/packages/markitup/src/markitup/__init__.py
@@ -7,7 +7,7 @@ from ._markitup import (
    MarkItUp,
 )
 from ._base_converter import DocumentConverterResult, DocumentConverter
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo, Config
 from ._exceptions import (
    MarkItUpException,
    MissingDependencyException,
@@ -27,4 +27,5 @@ __all__ = [
    "FileConversionException",
    "UnsupportedFormatException",
    "StreamInfo",
+    "Config"
 ]
--- a/packages/markitup/src/markitup/_base_converter.py
+++ b/packages/markitup/src/markitup/_base_converter.py
@@ -2,7 +2,7 @@ import os
 import tempfile
 from warnings import warn
 from typing import Any, Union, BinaryIO, Optional, List, Dict
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo
 import re


@@ -27,19 +27,18 @@ class DocumentConverterResult:
        """
        self.markdown = markdown
        self.title = title
-    
+
    def to_llm(self) -> List[Dict[str, Any]]:
        """
        Convert markdown with base64 images to a format compatible with OpenAI's API.
-        
+
        This function parses the markdown content, extracting text and images in their
        original order, and returns a list of content elements in OpenAI's format.
-        
+
        Returns:
            List[Dict[str, Any]]: A list of dictionaries representing the content elements
                                (text and images) in their original order.
        """
-        

        # Pattern to match markdown image syntax with base64 data
        pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from warnings import warn
 import magic

-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo, Config

 from .converters import (
    PlainTextConverter,
@@ -33,7 +33,7 @@ class MarkItUp:

    def __init__(
        self,
-        config: Optional[Dict[str, Any]] = None,
+        config: Config = Config(),
    ):
        self.config = config

@@ -42,10 +42,12 @@ class MarkItUp:
        # Deal with unsupported file types
        match stream_info.category:
            case "ppt":
-                raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
+                raise UnsupportedFormatException(
+                    ".ppt files are not supported, try .pptx instead")
            case "other":
-                raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
-        
+                raise UnsupportedFormatException(
+                    f"{stream_info.magic_type} files are not supported")
+
        try:
            match stream_info.category:
                case "text":
@@ -55,7 +57,8 @@ class MarkItUp:
                case "pdf":
                    return PdfConverter().convert(stream, stream_info), stream_info
        except FailedConversionAttempt:
-            raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
+            raise FileConversionException(
+                f"Failed to convert file of type {stream_info.magic_type}")
        return stream_info

    def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
@@ -97,4 +100,4 @@ class MarkItUp:
            category = "other"

        byte_stream.seek(original_position)
-        return StreamInfo(magic_type=magic_type, category=category)
+        return StreamInfo(magic_type=magic_type, category=category)
--- a/packages/markitup/src/markitup/_schemas.py
+++ b/packages/markitup/src/markitup/_schemas.py
@@ -0,0 +1,15 @@
+from dataclasses import dataclass, asdict, field
+from typing import Optional, List, Literal
+
+
+@dataclass
+class StreamInfo:
+    magic_type: Optional[str] = None
+    category: Optional[str] = None
+
+
+@dataclass
+class Config:
+    modality: List[Literal["image", "audio"]] = field(
+        default_factory=lambda: ["image", "audio"]
+    )
--- a/packages/markitup/src/markitup/_stream_info.py
+++ b/packages/markitup/src/markitup/_stream_info.py
@@ -1,8 +0,0 @@
-from dataclasses import dataclass, asdict
-from typing import Optional
-
-
-@dataclass
-class StreamInfo:
-    magic_type: Optional[str] = None
-    category: Optional[str] = None
--- a/packages/markitup/src/markitup/converter_utils/utils.py
+++ b/packages/markitup/src/markitup/converter_utils/utils.py
@@ -1,6 +1,6 @@
 import os
 from io import BytesIO
-from markitup._stream_info import StreamInfo
+from markitup._schemas import StreamInfo
 import magic


--- a/packages/markitup/src/markitup/converters/_audio_converter.py
+++ b/packages/markitup/src/markitup/converters/_audio_converter.py
@@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional

 from ._exiftool import exiftool_metadata
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from .._exceptions import MissingDependencyException

 ACCEPTED_MIME_TYPE_PREFIXES = [
@@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter):
        # Transcribe
        if audio_format:
            try:
-                transcript = transcribe_audio(file_stream, audio_format=audio_format)
+                transcript = transcribe_audio(
+                    file_stream, audio_format=audio_format)
                if transcript:
                    md_content += "\n\n### Audio Transcript:\n" + transcript
            except MissingDependencyException:
--- a/packages/markitup/src/markitup/converters/_csv_converter.py
+++ b/packages/markitup/src/markitup/converters/_csv_converter.py
@@ -5,7 +5,7 @@ from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo

 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/csv",
--- a/packages/markitup/src/markitup/converters/_docx_converter.py
+++ b/packages/markitup/src/markitup/converters/_docx_converter.py
@@ -5,7 +5,7 @@ from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from ..converter_utils.docx.pre_process import pre_process_docx
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

 # Try loading optional (but in this case, required) dependencies
@@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter):
        style_map = kwargs.get("style_map", None)
        pre_process_stream = pre_process_docx(file_stream)
        return self._html_converter.convert_string(
-            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
+            mammoth.convert_to_html(
+                pre_process_stream, style_map=style_map).value,
            **kwargs,
        )
--- a/packages/markitup/src/markitup/converters/_html_converter.py
+++ b/packages/markitup/src/markitup/converters/_html_converter.py
@@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from ._markdownify import _CustomMarkdownify

 ACCEPTED_MAGIC_TYPE_PREFIXES = [
@@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [

 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""
+
    def convert(
        self,
        file_stream: BinaryIO,
@@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter):
    ) -> DocumentConverterResult:
        # Parse the stream
        encoding = "utf-8"
-        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = BeautifulSoup(file_stream, "html.parser",
+                             from_encoding=encoding)

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@@ -3,7 +3,7 @@ import io
 import base64

 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo

 import fitz

@@ -21,42 +21,43 @@ class PdfConverter(DocumentConverter):
    ) -> DocumentConverterResult:
        # Create a document object from the stream
        doc = fitz.open(stream=file_stream, filetype="pdf")
-        
+
        # Extract text and images from all pages
        markdown_content = ""
        image_count = 0
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
-            
+
            # Get text with the default "text" mode which gives plain text
            page_text = page.get_text("text")
            # Add page marker
            markdown_content += f"\n\n## Page {page_num + 1}\n\n"
            markdown_content += page_text + "\n\n"
-            
+
            # Extract images from the page
            image_list = page.get_images(full=True)
-            
+
            for img_index, img_info in enumerate(image_list):
                xref = img_info[0]  # Get the image reference
                base_image = doc.extract_image(xref)
-                
+
                if base_image:
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
-                    
+
                    try:
                        # Convert image to base64 for markdown embedding
-                        img_base64 = base64.b64encode(image_bytes).decode('utf-8')
+                        img_base64 = base64.b64encode(
+                            image_bytes).decode('utf-8')
                        # Add image to markdown with a unique identifier
                        image_count += 1
                        markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
                    except Exception as e:
                        markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
-        
+
        # Close the document to free resources
        doc.close()
        print(markdown_content)
        return DocumentConverterResult(
            markdown=markdown_content,
-        )
+        )
--- a/packages/markitup/src/markitup/converters/_plain_text_converter.py
+++ b/packages/markitup/src/markitup/converters/_plain_text_converter.py
@@ -1,11 +1,12 @@
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo


 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
+
    def convert(
        self,
        file_stream: BinaryIO,
--- a/packages/markitup/src/markitup/converters/_pptx_converter.py
+++ b/packages/markitup/src/markitup/converters/_pptx_converter.py
@@ -10,7 +10,7 @@ from operator import attrgetter

 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 import pptx


@@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):

                    # Also grab any description embedded in the deck
                    try:
-                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                            "descr", "")
                    except Exception:
                        # Unable to get alt text
                        pass
@@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter):
                    b64_string = base64.b64encode(blob).decode("utf-8")
                    md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"

-
                # Tables
                if self._is_table(shape):
-                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)
+                    md_content += self._convert_table_to_markdown(
+                        shape.table, **kwargs)

                # Charts
                if shape.has_chart:
@@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter):

                # Group Shapes
                if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
-                    sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
+                    sorted_shapes = sorted(
+                        shape.shapes, key=attrgetter("top", "left"))
                    for subshape in sorted_shapes:
                        get_shape_content(subshape, **kwargs)

@@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter):
        html_table += "</table></body></html>"

        return (
-            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+            self._html_converter.convert_string(
+                html_table, **kwargs).markdown.strip()
            + "\n"
        )

--- a/packages/markitup/src/markitup/converters/_xlsx_converter.py
+++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py
@@ -3,7 +3,7 @@ from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo

 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later