From 03f3fa98290f6c67828de85cc642a01fcf572b21 Mon Sep 17 00:00:00 2001
From: rong-xyz <rong@pathintegral.xyz>
Date: Tue, 22 Apr 2025 07:00:30 +0000
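Subject: [PATCH] Add Config schema with modality option; move StreamInfo to _schemas

Replace _stream_info.py with _schemas.py, which holds StreamInfo and a new
Config dataclass (modality defaults to ["image", "audio"]). MarkItUp now
accepts a Config, Config is exported from the package, and every converter
imports StreamInfo from the new module. Also includes whitespace and
line-wrapping cleanups.
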
---
 packages/markitup/src/markitup/__init__.py    |  3 ++-
 .../markitup/src/markitup/_base_converter.py  |  9 ++++----
 packages/markitup/src/markitup/_markitup.py   | 17 ++++++++-------
 packages/markitup/src/markitup/_schemas.py    | 15 +++++++++++++
 .../markitup/src/markitup/_stream_info.py     |  8 -------
 .../src/markitup/converter_utils/utils.py     |  2 +-
 .../markitup/converters/_audio_converter.py   |  5 +++--
 .../src/markitup/converters/_csv_converter.py |  2 +-
 .../markitup/converters/_docx_converter.py    |  5 +++--
 .../markitup/converters/_html_converter.py    |  6 ++++--
 .../src/markitup/converters/_pdf_converter.py | 21 ++++++++++---------
 .../converters/_plain_text_converter.py       |  3 ++-
 .../markitup/converters/_pptx_converter.py    | 15 +++++++------
 .../markitup/converters/_xlsx_converter.py    |  2 +-
 14 files changed, 66 insertions(+), 47 deletions(-)
 create mode 100644 packages/markitup/src/markitup/_schemas.py
 delete mode 100644 packages/markitup/src/markitup/_stream_info.py

diff --git a/packages/markitup/src/markitup/__init__.py b/packages/markitup/src/markitup/__init__.py
index aef329a..ba22cbc 100644
--- a/packages/markitup/src/markitup/__init__.py
+++ b/packages/markitup/src/markitup/__init__.py
@@ -7,7 +7,7 @@ from ._markitup import (
     MarkItUp,
 )
 from ._base_converter import DocumentConverterResult, DocumentConverter
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo, Config
 from ._exceptions import (
     MarkItUpException,
     MissingDependencyException,
@@ -27,4 +27,5 @@ __all__ = [
     "FileConversionException",
     "UnsupportedFormatException",
     "StreamInfo",
+    "Config"
 ]
diff --git a/packages/markitup/src/markitup/_base_converter.py b/packages/markitup/src/markitup/_base_converter.py
index cdabef9..9de88df 100644
--- a/packages/markitup/src/markitup/_base_converter.py
+++ b/packages/markitup/src/markitup/_base_converter.py
@@ -2,7 +2,7 @@ import os
 import tempfile
 from warnings import warn
 from typing import Any, Union, BinaryIO, Optional, List, Dict
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo
 import re
 
 
@@ -27,19 +27,18 @@ class DocumentConverterResult:
         """
         self.markdown = markdown
         self.title = title
-    
+
     def to_llm(self) -> List[Dict[str, Any]]:
         """
         Convert markdown with base64 images to a format compatible with OpenAI's API.
-        
+
         This function parses the markdown content, extracting text and images in their
         original order, and returns a list of content elements in OpenAI's format.
-        
+
         Returns:
             List[Dict[str, Any]]: A list of dictionaries representing the content elements
                                 (text and images) in their original order.
         """
-        
 
         # Pattern to match markdown image syntax with base64 data
         pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py
index c2fb0a2..0b5ddf0 100644
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from warnings import warn
 import magic
 
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo, Config
 
 from .converters import (
     PlainTextConverter,
@@ -33,7 +33,7 @@ class MarkItUp:
 
     def __init__(
         self,
-        config: Optional[Dict[str, Any]] = None,
+        config: Optional[Config] = None,  # None means "use default settings"
     ):
-        self.config = config
+        self.config = Config() if config is None else config  # fresh Config per instance, never a shared default
 
@@ -42,10 +42,12 @@ class MarkItUp:
         # Deal with unsupported file types
         match stream_info.category:
             case "ppt":
-                raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
+                raise UnsupportedFormatException(
+                    ".ppt files are not supported, try .pptx instead")
             case "other":
-                raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
-        
+                raise UnsupportedFormatException(
+                    f"{stream_info.magic_type} files are not supported")
+
         try:
             match stream_info.category:
                 case "text":
@@ -55,7 +57,8 @@ class MarkItUp:
                 case "pdf":
                     return PdfConverter().convert(stream, stream_info), stream_info
         except FailedConversionAttempt:
-            raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
+            raise FileConversionException(
+                f"Failed to convert file of type {stream_info.magic_type}")
         return stream_info
 
     def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
@@ -97,4 +100,4 @@ class MarkItUp:
             category = "other"
 
         byte_stream.seek(original_position)
-        return StreamInfo(magic_type=magic_type, category=category)
\ No newline at end of file
+        return StreamInfo(magic_type=magic_type, category=category)
diff --git a/packages/markitup/src/markitup/_schemas.py b/packages/markitup/src/markitup/_schemas.py
new file mode 100644
index 0000000..ecfce92
--- /dev/null
+++ b/packages/markitup/src/markitup/_schemas.py
@@ -0,0 +1,15 @@
+from dataclasses import dataclass, asdict, field
+from typing import Optional, List, Literal
+
+
+@dataclass
+class StreamInfo:  # Type information describing an input byte stream.
+    magic_type: Optional[str] = None  # detected type string; interpolated into unsupported/failed-conversion errors
+    category: Optional[str] = None  # coarse routing key matched by MarkItUp, e.g. "text", "pdf", "ppt", "other"
+
+
+@dataclass
+class Config:  # Options object passed to MarkItUp(config=...).
+    modality: List[Literal["image", "audio"]] = field(  # NOTE(review): consumers of this field are not in this patch - confirm semantics
+        default_factory=lambda: ["image", "audio"]  # factory builds a new list per instance; both modalities by default
+    )
diff --git a/packages/markitup/src/markitup/_stream_info.py b/packages/markitup/src/markitup/_stream_info.py
deleted file mode 100644
index 66e8c72..0000000
--- a/packages/markitup/src/markitup/_stream_info.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dataclasses import dataclass, asdict
-from typing import Optional
-
-
-@dataclass
-class StreamInfo:
-    magic_type: Optional[str] = None
-    category: Optional[str] = None
\ No newline at end of file
diff --git a/packages/markitup/src/markitup/converter_utils/utils.py b/packages/markitup/src/markitup/converter_utils/utils.py
index 8d5df3d..12e533d 100644
--- a/packages/markitup/src/markitup/converter_utils/utils.py
+++ b/packages/markitup/src/markitup/converter_utils/utils.py
@@ -1,6 +1,6 @@
 import os
 from io import BytesIO
-from markitup._stream_info import StreamInfo
+from markitup._schemas import StreamInfo
 import magic
 
 
diff --git a/packages/markitup/src/markitup/converters/_audio_converter.py b/packages/markitup/src/markitup/converters/_audio_converter.py
index 57a828d..eeff58e 100644
--- a/packages/markitup/src/markitup/converters/_audio_converter.py
+++ b/packages/markitup/src/markitup/converters/_audio_converter.py
@@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
 
 from ._exiftool import exiftool_metadata
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from .._exceptions import MissingDependencyException
 
 ACCEPTED_MIME_TYPE_PREFIXES = [
@@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter):
         # Transcribe
         if audio_format:
             try:
-                transcript = transcribe_audio(file_stream, audio_format=audio_format)
+                transcript = transcribe_audio(
+                    file_stream, audio_format=audio_format)
                 if transcript:
                     md_content += "\n\n### Audio Transcript:\n" + transcript
             except MissingDependencyException:
diff --git a/packages/markitup/src/markitup/converters/_csv_converter.py b/packages/markitup/src/markitup/converters/_csv_converter.py
index 7162889..78963ed 100644
--- a/packages/markitup/src/markitup/converters/_csv_converter.py
+++ b/packages/markitup/src/markitup/converters/_csv_converter.py
@@ -5,7 +5,7 @@ from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "text/csv",
diff --git a/packages/markitup/src/markitup/converters/_docx_converter.py b/packages/markitup/src/markitup/converters/_docx_converter.py
index b320695..0db97a9 100644
--- a/packages/markitup/src/markitup/converters/_docx_converter.py
+++ b/packages/markitup/src/markitup/converters/_docx_converter.py
@@ -5,7 +5,7 @@ from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from ..converter_utils.docx.pre_process import pre_process_docx
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
 # Try loading optional (but in this case, required) dependencies
@@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter):
         style_map = kwargs.get("style_map", None)
         pre_process_stream = pre_process_docx(file_stream)
         return self._html_converter.convert_string(
-            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
+            mammoth.convert_to_html(
+                pre_process_stream, style_map=style_map).value,
             **kwargs,
         )
diff --git a/packages/markitup/src/markitup/converters/_html_converter.py b/packages/markitup/src/markitup/converters/_html_converter.py
index b85a68d..91db39a 100644
--- a/packages/markitup/src/markitup/converters/_html_converter.py
+++ b/packages/markitup/src/markitup/converters/_html_converter.py
@@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from ._markdownify import _CustomMarkdownify
 
 ACCEPTED_MAGIC_TYPE_PREFIXES = [
@@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [
 
 class HtmlConverter(DocumentConverter):
     """Anything with content type text/html"""
+
     def convert(
         self,
         file_stream: BinaryIO,
@@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8"
-        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = BeautifulSoup(file_stream, "html.parser",
+                             from_encoding=encoding)
 
         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
diff --git a/packages/markitup/src/markitup/converters/_pdf_converter.py b/packages/markitup/src/markitup/converters/_pdf_converter.py
index 3f64f42..0794c8a 100644
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@@ -3,7 +3,7 @@ import io
 import base64
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 
 import fitz
 
@@ -21,42 +21,43 @@ class PdfConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Create a document object from the stream
         doc = fitz.open(stream=file_stream, filetype="pdf")
-        
+
         # Extract text and images from all pages
         markdown_content = ""
         image_count = 0
         for page_num in range(len(doc)):
             page = doc.load_page(page_num)
-            
+
             # Get text with the default "text" mode which gives plain text
             page_text = page.get_text("text")
             # Add page marker
             markdown_content += f"\n\n## Page {page_num + 1}\n\n"
             markdown_content += page_text + "\n\n"
-            
+
             # Extract images from the page
             image_list = page.get_images(full=True)
-            
+
             for img_index, img_info in enumerate(image_list):
                 xref = img_info[0]  # Get the image reference
                 base_image = doc.extract_image(xref)
-                
+
                 if base_image:
                     image_bytes = base_image["image"]
                     image_ext = base_image["ext"]
-                    
+
                     try:
                         # Convert image to base64 for markdown embedding
-                        img_base64 = base64.b64encode(image_bytes).decode('utf-8')
+                        img_base64 = base64.b64encode(
+                            image_bytes).decode('utf-8')
                         # Add image to markdown with a unique identifier
                         image_count += 1
                         markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
                     except Exception as e:
                         markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
-        
+
         # Close the document to free resources
         doc.close()
         print(markdown_content)
         return DocumentConverterResult(
             markdown=markdown_content,
-        )
\ No newline at end of file
+        )
diff --git a/packages/markitup/src/markitup/converters/_plain_text_converter.py b/packages/markitup/src/markitup/converters/_plain_text_converter.py
index b7f776e..740a4f7 100644
--- a/packages/markitup/src/markitup/converters/_plain_text_converter.py
+++ b/packages/markitup/src/markitup/converters/_plain_text_converter.py
@@ -1,11 +1,12 @@
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 
 
 class PlainTextConverter(DocumentConverter):
     """Anything with content type text/plain"""
+
     def convert(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py
index f1c112b..3ee4595 100644
--- a/packages/markitup/src/markitup/converters/_pptx_converter.py
+++ b/packages/markitup/src/markitup/converters/_pptx_converter.py
@@ -10,7 +10,7 @@ from operator import attrgetter
 
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 import pptx
 
 
@@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
 
                     # Also grab any description embedded in the deck
                     try:
-                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                            "descr", "")
                     except Exception:
                         # Unable to get alt text
                         pass
@@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter):
                     b64_string = base64.b64encode(blob).decode("utf-8")
                     md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
 
-
                 # Tables
                 if self._is_table(shape):
-                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)
+                    md_content += self._convert_table_to_markdown(
+                        shape.table, **kwargs)
 
                 # Charts
                 if shape.has_chart:
@@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter):
 
                 # Group Shapes
                 if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
-                    sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
+                    sorted_shapes = sorted(
+                        shape.shapes, key=attrgetter("top", "left"))
                     for subshape in sorted_shapes:
                         get_shape_content(subshape, **kwargs)
 
@@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter):
         html_table += "</table></body></html>"
 
         return (
-            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+            self._html_converter.convert_string(
+                html_table, **kwargs).markdown.strip()
             + "\n"
         )
 
diff --git a/packages/markitup/src/markitup/converters/_xlsx_converter.py b/packages/markitup/src/markitup/converters/_xlsx_converter.py
index 28f73a0..8769fe0 100644
--- a/packages/markitup/src/markitup/converters/_xlsx_converter.py
+++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py
@@ -3,7 +3,7 @@ from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later