Merge pull request #2 from pathintegral-institute/rong/tech-139-modality-conversion

Rong/tech 139 modality conversion
2025-04-22 19:30:24 +08:00 · 2025-04-22 19:30:24 +08:00 · cd85971867
commit cd85971867
parent e729da2b38 e521dbcf2d
21 changed files with 257 additions and 470 deletions
--- a/packages/markitup/pyproject.toml
+++ b/packages/markitup/pyproject.toml
@@ -32,8 +32,6 @@ dependencies = [
  "python-pptx",
  "mammoth",
  "pandas",
-  "openpyxl",
-  "xlrd",
  "lxml",
  "olefile",
  "pydub",
--- a/packages/markitup/src/markitup/__init__.py
+++ b/packages/markitup/src/markitup/__init__.py
@@ -7,9 +7,8 @@ from ._markitup import (
    MarkItUp,
 )
 from ._base_converter import DocumentConverterResult, DocumentConverter
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo, Config
 from ._exceptions import (
-    MarkItUpException,
    MissingDependencyException,
    FailedConversionAttempt,
    FileConversionException,
@@ -21,10 +20,10 @@ __all__ = [
    "MarkItUp",
    "DocumentConverter",
    "DocumentConverterResult",
-    "MarkItUpException",
    "MissingDependencyException",
    "FailedConversionAttempt",
    "FileConversionException",
    "UnsupportedFormatException",
    "StreamInfo",
+    "Config"
 ]
--- a/packages/markitup/src/markitup/_base_converter.py
+++ b/packages/markitup/src/markitup/_base_converter.py
@@ -2,8 +2,9 @@ import os
 import tempfile
 from warnings import warn
 from typing import Any, Union, BinaryIO, Optional, List, Dict
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo
 import re
+import base64


 class DocumentConverterResult:
@@ -11,9 +12,11 @@ class DocumentConverterResult:

    def __init__(
        self,
-        markdown: str,
+        markdown: str = "",
        *,
        title: Optional[str] = None,
+        audio_stream: Optional[BinaryIO] = None,
+        stream_info: Optional[StreamInfo] = None,
    ):
        """
        Initialize the DocumentConverterResult.
@@ -26,7 +29,9 @@ class DocumentConverterResult:
        - title: Optional title of the document.
        """
        self.markdown = markdown
+        self.audio_stream = audio_stream
        self.title = title
+        self.stream_info = stream_info

    def to_llm(self) -> List[Dict[str, Any]]:
        """
@@ -40,7 +45,6 @@ class DocumentConverterResult:
                                (text and images) in their original order.
        """

-
        # Pattern to match markdown image syntax with base64 data
        pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'

@@ -80,7 +84,14 @@ class DocumentConverterResult:
                    "type": "text",
                    "text": text_chunk
                })
-
+        if self.audio_stream:
+            audio_b64 = base64.b64encode(
+                self.audio_stream.read()).decode('utf-8')
+            content.append({
+                "type": "media",
+                "mime_type": self.stream_info.magic_type,
+                "data": audio_b64
+            })
        return content

    @property
@@ -105,7 +116,7 @@ class DocumentConverter:
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+        ** kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a document to Markdown text.
--- a/packages/markitup/src/markitup/_exceptions.py
+++ b/packages/markitup/src/markitup/_exceptions.py
@@ -8,15 +8,7 @@ MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential
 * etc."""


-class MarkItUpException(Exception):
-    """
-    Base exception class for MarkItUp.
-    """
-
-    pass
-
-
-class MissingDependencyException(MarkItUpException):
+class MissingDependencyException(Exception):
    """
    Converters shipped with MarkItUp may depend on optional
    dependencies. This exception is thrown when a converter's
@@ -31,7 +23,7 @@ class MissingDependencyException(MarkItUpException):
    pass


-class UnsupportedFormatException(MarkItUpException):
+class UnsupportedFormatException(Exception):
    """
    Thrown when no suitable converter was found for the given file.
    """
@@ -39,17 +31,16 @@ class UnsupportedFormatException(MarkItUpException):
    pass


-class FailedConversionAttempt(object):
+class FailedConversionAttempt(Exception):
    """
-    Represents an a single attempt to convert a file.
+    Represents a single attempt to convert a file.
    """

-    def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
-        self.converter = converter
-        self.exc_info = exc_info
+    def __init__(self):
+        super().__init__(f"Conversion attempt failed!")


-class FileConversionException(MarkItUpException):
+class FileConversionException(Exception):
    """
    Thrown when a suitable converter was found, but the conversion
    process fails for any reason.
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from warnings import warn
 import magic

-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo, Config

 from .converters import (
    PlainTextConverter,
@@ -14,7 +14,7 @@ from .converters import (
    XlsxConverter,
    XlsConverter,
    PptxConverter,
-    # AudioConverter,
+    AudioConverter,
    CsvConverter,
 )

@@ -33,30 +33,42 @@ class MarkItUp:

    def __init__(
        self,
-        config: Optional[Dict[str, Any]] = None,
+        config: Config = Config(),
    ):
        self.config = config

    def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]:
        stream_info: StreamInfo = self._get_stream_info(stream)
        # Deal with unsupported file types
-        match stream_info.category:
-            case "ppt":
-                raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
-            case "other":
-                raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
-        
        try:
            match stream_info.category:
                case "text":
                    return PlainTextConverter().convert(stream, stream_info), stream_info
                case "pptx":
-                    return PptxConverter().convert(stream, stream_info), stream_info
+                    return PptxConverter(config=self.config).convert(stream, stream_info), stream_info
                case "pdf":
-                    return PdfConverter().convert(stream, stream_info), stream_info
+                    return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
+                case "audio":
+                    return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
+                case "xlsx":
+                    return XlsxConverter(config=self.config).convert(stream, stream_info), stream_info
+                case "xls":
+                    return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
+                case "csv":
+                    return CsvConverter().convert(stream, stream_info), stream_info
+                case "docx":
+                    return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
+                case _:
+                    match stream_info.category:
+                        case "ppt":
+                            raise UnsupportedFormatException(
+                                ".ppt files are not supported, try .pptx instead")
+                        case "other":
+                            raise UnsupportedFormatException(
+                                f"{stream_info.magic_type} files are not supported")
        except FailedConversionAttempt:
-            raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
-        return stream_info
+            raise FileConversionException(
+                f"Failed to convert file of type {stream_info.magic_type}")

    def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
        original_position = byte_stream.tell()
@@ -91,8 +103,13 @@ class MarkItUp:
            category = "docx"
        elif magic_type == "application/pdf":
            category = "pdf"
+        elif magic_type == "application/csv":
+            category = "csv"
        elif magic_type.startswith("text/"):
-            category = "text"
+            if magic_type == "text/csv":
+                category = "csv"
+            else:
+                category = "text"
        else:
            category = "other"

--- a/packages/markitup/src/markitup/_schemas.py
+++ b/packages/markitup/src/markitup/_schemas.py
@@ -0,0 +1,15 @@
+from dataclasses import dataclass, asdict, field
+from typing import Optional, List, Literal
+
+
+@dataclass
+class StreamInfo:
+    magic_type: Optional[str] = None
+    category: Optional[str] = None
+
+
+@dataclass
+class Config:
+    modalities: List[Literal["image", "audio"]] = field(
+        default_factory=lambda: ["image", "audio"]
+    )
--- a/packages/markitup/src/markitup/_stream_info.py
+++ b/packages/markitup/src/markitup/_stream_info.py
@@ -1,8 +0,0 @@
-from dataclasses import dataclass, asdict
-from typing import Optional
-
-
-@dataclass
-class StreamInfo:
-    magic_type: Optional[str] = None
-    category: Optional[str] = None
--- a/packages/markitup/src/markitup/converter_utils/utils.py
+++ b/packages/markitup/src/markitup/converter_utils/utils.py
@@ -1,7 +1,11 @@
 import os
 from io import BytesIO
-from markitup._stream_info import StreamInfo
+from markitup._schemas import StreamInfo
 import magic
+import speech_recognition as sr
+import pydub
+import io
+from typing import BinaryIO


 def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
@@ -38,65 +42,23 @@ def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
    return byte_streams


-def detect_file_types(file_dict):
-    """
-    Detects file types for a dictionary of {filename: BytesIO} pairs
-    using only magic type (content-based detection)
+def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
+    audio_format = 'mp3' if magic_type == 'audio/mpeg' else 'wav' if magic_type == 'audio/x-wav' else None

-    Args:
-        file_dict (dict): Dictionary with filenames as keys and BytesIO objects as values
+    match audio_format:
+        case 'mp3':
+            audio_segment = pydub.AudioSegment.from_file(
+                file_stream, format=audio_format)
+            audio_source = io.BytesIO()
+            audio_segment.export(audio_source, format="wav")
+            audio_source.seek(0)
+        case 'wav':
+            audio_source = file_stream
+        case _:
+            raise ValueError(f"Unsupported audio format: {magic_type}")

-    Returns:
-        dict: Dictionary with filenames as keys and file type information as values
-    """
-    result = {}
-
-    for filename, byte_stream in file_dict.items():
-        # Get the original position to reset later
-        original_position = byte_stream.tell()
-
-        # Reset stream position to beginning
-        byte_stream.seek(0)
-
-        # Get file content for analysis
-        file_content = byte_stream.read()
-
-        # Use python-magic to determine file type based on content
-        magic_type = magic.from_buffer(file_content, mime=True)
-
-        # Determine file category based on magic_type
-        if magic_type.startswith("image/"):
-            category = "image"
-        elif magic_type.startswith("audio/"):
-            category = "audio"
-        elif magic_type.startswith("video/"):
-            category = "video"
-        elif (
-            magic_type.startswith("application/vnd.ms-excel")
-            or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        ):
-            category = "xls"
-        elif (
-            magic_type.startswith("application/vnd.ms-powerpoint")
-            or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
-        ):
-            category = "ppt"
-        elif (
-            magic_type.startswith("application/msword")
-            or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ):
-            category = "doc"
-        elif magic_type == "application/pdf":
-            category = "pdf"
-        elif magic_type.startswith("text/"):
-            category = "text"
-        else:
-            category = "other"
-
-        # Store the results
-        result[filename] = StreamInfo(magic_type=magic_type, category=category)
-
-        # Reset stream position
-        byte_stream.seek(original_position)
-
-    return result
+    recognizer = sr.Recognizer()
+    with sr.AudioFile(audio_source) as source:
+        audio = recognizer.record(source)
+        transcript = recognizer.recognize_google(audio).strip()
+        return "[No speech detected]" if transcript == "" else transcript
--- a/packages/markitup/src/markitup/converters/__init__.py
+++ b/packages/markitup/src/markitup/converters/__init__.py
@@ -8,7 +8,7 @@ from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
-# from ._audio_converter import AudioConverter
+from ._audio_converter import AudioConverter
 from ._csv_converter import CsvConverter
 from ._markdownify import _CustomMarkdownify

@@ -27,7 +27,7 @@ __all__ = [
    "XlsConverter",
    "PptxConverter",
    "ImageConverter",
-    # "AudioConverter",
+    "AudioConverter",
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
--- a/packages/markitup/src/markitup/converters/_audio_converter.py
+++ b/packages/markitup/src/markitup/converters/_audio_converter.py
@@ -1,23 +1,10 @@
 import io
-from typing import Any, BinaryIO, Optional
+from typing import Any, BinaryIO, Optional, Tuple

-from ._exiftool import exiftool_metadata
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo, Config
 from .._exceptions import MissingDependencyException
-
-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "audio/x-wav",
-    "audio/mpeg",
-    "video/mp4",
-]
-
-ACCEPTED_FILE_EXTENSIONS = [
-    ".wav",
-    ".mp3",
-    ".m4a",
-    ".mp4",
-]
+from ..converter_utils.utils import transcribe_audio


 class AudioConverter(DocumentConverter):
@@ -25,77 +12,25 @@ class AudioConverter(DocumentConverter):
    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+    def __init__(self, config: Config):
+        self.config = config

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> DocumentConverterResult:
+        ** kwargs: Any,  # Options to pass to the converter
+    ) -> Tuple[DocumentConverterResult, StreamInfo]:
        md_content = ""

-        # Add metadata
-        metadata = exiftool_metadata(
-            file_stream, exiftool_path=kwargs.get("exiftool_path")
-        )
-        if metadata:
-            for f in [
-                "Title",
-                "Artist",
-                "Author",
-                "Band",
-                "Album",
-                "Genre",
-                "Track",
-                "DateTimeOriginal",
-                "CreateDate",
-                # "Duration", -- Wrong values when read from memory
-                "NumChannels",
-                "SampleRate",
-                "AvgBytesPerSec",
-                "BitsPerSample",
-            ]:
-                if f in metadata:
-                    md_content += f"{f}: {metadata[f]}\n"
-
-        # Figure out the audio format for transcription
-        if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
-            audio_format = "wav"
-        elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
-            audio_format = "mp3"
-        elif (
-            stream_info.extension in [".mp4", ".m4a"]
-            or stream_info.mimetype == "video/mp4"
-        ):
-            audio_format = "mp4"
-        else:
-            audio_format = None
-
        # Transcribe
-        if audio_format:
-            try:
-                transcript = transcribe_audio(file_stream, audio_format=audio_format)
-                if transcript:
-                    md_content += "\n\n### Audio Transcript:\n" + transcript
-            except MissingDependencyException:
-                pass
+        if 'audio' not in self.config.modalities:
+            transcript = transcribe_audio(
+                file_stream, magic_type=stream_info.magic_type)
+            if transcript:
+                md_content += "\n\n### Audio Transcript:\n" + transcript
+            return DocumentConverterResult(markdown=md_content.strip())
+        else:
+            return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)

        # Return the result
-        return DocumentConverterResult(markdown=md_content.strip())
--- a/packages/markitup/src/markitup/converters/_csv_converter.py
+++ b/packages/markitup/src/markitup/converters/_csv_converter.py
@@ -3,15 +3,8 @@ import csv
 import io
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
-from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
-
-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "text/csv",
-    "application/csv",
-]
-ACCEPTED_FILE_EXTENSIONS = [".csv"]
+from .._schemas import StreamInfo


 class CsvConverter(DocumentConverter):
@@ -19,24 +12,6 @@ class CsvConverter(DocumentConverter):
    Converts CSV files to Markdown tables.
    """

-    def __init__(self):
-        super().__init__()
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-        return False
-
    def convert(
        self,
        file_stream: BinaryIO,
@@ -44,10 +19,7 @@ class CsvConverter(DocumentConverter):
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Read the file content
-        if stream_info.charset:
-            content = file_stream.read().decode(stream_info.charset)
-        else:
-            content = str(from_bytes(file_stream.read()).best())
+        content = str(from_bytes(file_stream.read()).best())

        # Parse CSV content
        reader = csv.reader(io.StringIO(content))
--- a/packages/markitup/src/markitup/converters/_docx_converter.py
+++ b/packages/markitup/src/markitup/converters/_docx_converter.py
@@ -5,24 +5,8 @@ from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from ..converter_utils.docx.pre_process import pre_process_docx
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
-from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
-
-# Try loading optional (but in this case, required) dependencies
-# Save reporting of any exceptions for later
-_dependency_exc_info = None
-try:
-    import mammoth
-except ImportError:
-    # Preserve the error and stack trace for later
-    _dependency_exc_info = sys.exc_info()
-
-
-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-]
-
-ACCEPTED_FILE_EXTENSIONS = [".docx"]
+from .._schemas import StreamInfo, Config
+import mammoth


 class DocxConverter(HtmlConverter):
@@ -30,27 +14,8 @@ class DocxConverter(HtmlConverter):
    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
    """

-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+    def __init__(self, config: Config):
+        self._html_converter = HtmlConverter(config=config)

    def convert(
        self,
@@ -58,23 +23,11 @@ class DocxConverter(HtmlConverter):
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
-        # Check: the dependencies
-        if _dependency_exc_info is not None:
-            raise MissingDependencyException(
-                MISSING_DEPENDENCY_MESSAGE.format(
-                    converter=type(self).__name__,
-                    extension=".docx",
-                    feature="docx",
-                )
-            ) from _dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
-                _dependency_exc_info[2]
-            )

        style_map = kwargs.get("style_map", None)
        pre_process_stream = pre_process_docx(file_stream)
        return self._html_converter.convert_string(
-            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
+            mammoth.convert_to_html(
+                pre_process_stream, style_map=style_map).value,
            **kwargs,
        )
--- a/packages/markitup/src/markitup/converters/_html_converter.py
+++ b/packages/markitup/src/markitup/converters/_html_converter.py
@@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo, Config
 from ._markdownify import _CustomMarkdownify

 ACCEPTED_MAGIC_TYPE_PREFIXES = [
@@ -19,6 +19,10 @@ ACCEPTED_FILE_CATEGORY = [

 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""
+
+    def __init__(self, config: Config):
+        self.config = config
+
    def convert(
        self,
        file_stream: BinaryIO,
@@ -27,7 +31,8 @@ class HtmlConverter(DocumentConverter):
    ) -> DocumentConverterResult:
        # Parse the stream
        encoding = "utf-8"
-        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = BeautifulSoup(file_stream, "html.parser",
+                             from_encoding=encoding)

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
@@ -37,15 +42,17 @@ class HtmlConverter(DocumentConverter):
        body_elm = soup.find("body")
        webpage_text = ""
        if body_elm:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify(
+                config=self.config, **kwargs).convert_soup(body_elm)
        else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+            webpage_text = _CustomMarkdownify(
+                config=self.config, **kwargs).convert_soup(soup)

        assert isinstance(webpage_text, str)

        # remove leading and trailing \n
        webpage_text = webpage_text.strip()
-
+        print(webpage_text)
        return DocumentConverterResult(
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
--- a/packages/markitup/src/markitup/converters/_markdownify.py
+++ b/packages/markitup/src/markitup/converters/_markdownify.py
@@ -3,6 +3,7 @@ import markdownify

 from typing import Any, Optional
 from urllib.parse import quote, unquote, urlparse, urlunparse
+from .._schemas import Config


 class _CustomMarkdownify(markdownify.MarkdownConverter):
@@ -15,11 +16,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

-    def __init__(self, **options: Any):
-        options["heading_style"] = options.get("heading_style", markdownify.ATX)
+    def __init__(self, config: Config, **options: Any):
+        options["heading_style"] = options.get(
+            "heading_style", markdownify.ATX)
        options["keep_data_uris"] = options.get("keep_data_uris", False)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)
+        self.config = config

    def convert_hn(
        self,
@@ -58,9 +61,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
-                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
+                # type: ignore
+                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:
                    return "%s%s%s" % (prefix, text, suffix)
-                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
+                href = urlunparse(parsed_url._replace(
+                    path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

@@ -95,17 +100,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
-        if (
-            convert_as_inline
-            and el.parent.name not in self.options["keep_inline_images_in"]
-        ):
+
+        if "image" in self.config.modalities:
+            return "![%s](%s%s)" % (alt, src, title_part)
+        else:
            return alt

-        # Remove dataURIs
-        if src.startswith("data:") and not self.options["keep_data_uris"]:
-            src = src.split(",")[0] + "..."
-
-        return "![%s](%s%s)" % (alt, src, title_part)
-
    def convert_soup(self, soup: Any) -> str:
        return super().convert_soup(soup)  # type: ignore
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@@ -3,7 +3,7 @@ import io
 import base64

 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo, Config

 import fitz

@@ -13,6 +13,9 @@ class PdfConverter(DocumentConverter):
    Converts PDFs to Markdown with embedded images.
    """

+    def __init__(self, config: Config):
+        self.config = config
+
    def convert(
        self,
        file_stream: BinaryIO,
@@ -36,27 +39,28 @@ class PdfConverter(DocumentConverter):

            # Extract images from the page
            image_list = page.get_images(full=True)
+            if 'image' in self.config.modalities:
+                for img_index, img_info in enumerate(image_list):
+                    xref = img_info[0]  # Get the image reference
+                    base_image = doc.extract_image(xref)

-            for img_index, img_info in enumerate(image_list):
-                xref = img_info[0]  # Get the image reference
-                base_image = doc.extract_image(xref)
-                
-                if base_image:
-                    image_bytes = base_image["image"]
-                    image_ext = base_image["ext"]
-                    
-                    try:
-                        # Convert image to base64 for markdown embedding
-                        img_base64 = base64.b64encode(image_bytes).decode('utf-8')
-                        # Add image to markdown with a unique identifier
-                        image_count += 1
-                        markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
-                    except Exception as e:
-                        markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
+                    if base_image:
+                        image_bytes = base_image["image"]
+                        image_ext = base_image["ext"]

+                        try:
+                            # Convert image to base64 for markdown embedding
+                            img_base64 = base64.b64encode(
+                                image_bytes).decode('utf-8')
+                            # Add image to markdown with a unique identifier
+                            image_count += 1
+                            markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
+                        except Exception as e:
+                            markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
+            else:
+                markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
        # Close the document to free resources
        doc.close()
-        print(markdown_content)
        return DocumentConverterResult(
            markdown=markdown_content,
        )
--- a/packages/markitup/src/markitup/converters/_plain_text_converter.py
+++ b/packages/markitup/src/markitup/converters/_plain_text_converter.py
@@ -1,11 +1,12 @@
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo


 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
+
    def convert(
        self,
        file_stream: BinaryIO,
--- a/packages/markitup/src/markitup/converters/_pptx_converter.py
+++ b/packages/markitup/src/markitup/converters/_pptx_converter.py
@@ -10,7 +10,7 @@ from operator import attrgetter

 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo, Config
 import pptx


@@ -26,9 +26,9 @@ class PptxConverter(DocumentConverter):
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
    """

-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
+    def __init__(self, config: Config):
+        self._html_converter = HtmlConverter(config=config)
+        self.config = config

    def convert(
        self,
@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):

                    # Also grab any description embedded in the deck
                    try:
-                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                            "descr", "")
                    except Exception:
                        # Unable to get alt text
                        pass
@ -69,16 +70,20 @@ class PptxConverter(DocumentConverter):
                    alt_text = re.sub(r"\s+", " ", alt_text).strip()

                     # Embed the image as a base64 data URI only when the 'image' modality is enabled; otherwise reference it by filename
-
-                    blob = shape.image.blob
-                    content_type = shape.image.content_type or "image/png"
-                    b64_string = base64.b64encode(blob).decode("utf-8")
-                    md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
-
+                    if 'image' in self.config.modalities:
+                        blob = shape.image.blob
+                        content_type = shape.image.content_type or "image/png"
+                        b64_string = base64.b64encode(blob).decode("utf-8")
+                        md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
+                    else:
+                        filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                        md_content += "\n![" + alt_text + \
+                            "](" + filename + ")\n"

                # Tables
                if self._is_table(shape):
-                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)
+                    md_content += self._convert_table_to_markdown(
+                        shape.table, **kwargs)

                # Charts
                if shape.has_chart:
@ -93,7 +98,8 @@ class PptxConverter(DocumentConverter):

                # Group Shapes
                if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
-                    sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
+                    sorted_shapes = sorted(
+                        shape.shapes, key=attrgetter("top", "left"))
                    for subshape in sorted_shapes:
                        get_shape_content(subshape, **kwargs)

@ -141,7 +147,8 @@ class PptxConverter(DocumentConverter):
        html_table += "</table></body></html>"

        return (
-            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+            self._html_converter.convert_string(
+                html_table, **kwargs).markdown.strip()
            + "\n"
        )

--- a/packages/markitup/src/markitup/converters/_xlsx_converter.py
+++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py
@ -1,36 +1,8 @@
-import sys
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
-from .._stream_info import StreamInfo
-
-# Try loading optional (but in this case, required) dependencies
-# Save reporting of any exceptions for later
-_xlsx_dependency_exc_info = None
-try:
-    import pandas as pd
-    import openpyxl
-except ImportError:
-    _xlsx_dependency_exc_info = sys.exc_info()
-
-_xls_dependency_exc_info = None
-try:
-    import pandas as pd
-    import xlrd
-except ImportError:
-    _xls_dependency_exc_info = sys.exc_info()
-
-ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-]
-ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
-
-ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
-    "application/vnd.ms-excel",
-    "application/excel",
-]
-ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
+from .._schemas import StreamInfo, Config
+import pandas as pd


 class XlsxConverter(DocumentConverter):
@ -38,27 +10,8 @@ class XlsxConverter(DocumentConverter):
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+    def __init__(self, config: Config):
+        self._html_converter = HtmlConverter(config=config)

    def convert(
        self,
@ -66,20 +19,6 @@ class XlsxConverter(DocumentConverter):
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
-        # Check the dependencies
-        if _xlsx_dependency_exc_info is not None:
-            raise MissingDependencyException(
-                MISSING_DEPENDENCY_MESSAGE.format(
-                    converter=type(self).__name__,
-                    extension=".xlsx",
-                    feature="xlsx",
-                )
-            ) from _xlsx_dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
-                _xlsx_dependency_exc_info[2]
-            )
-
        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
        md_content = ""
        for s in sheets:
@ -100,27 +39,8 @@ class XlsConverter(DocumentConverter):
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """

-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+    def __init__(self, config: Config):
+        self._html_converter = HtmlConverter(config=config)

    def convert(
        self,
@ -128,19 +48,6 @@ class XlsConverter(DocumentConverter):
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
-        # Load the dependencies
-        if _xls_dependency_exc_info is not None:
-            raise MissingDependencyException(
-                MISSING_DEPENDENCY_MESSAGE.format(
-                    converter=type(self).__name__,
-                    extension=".xls",
-                    feature="xls",
-                )
-            ) from _xls_dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
-                _xls_dependency_exc_info[2]
-            )

        sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
        md_content = ""
--- a/packages/markitup/tests/test_files/test.csv
+++ b/packages/markitup/tests/test_files/test.csv
@ -0,0 +1,51 @@
+ID,Name,Age,Country,Email
+1,Name_1,62,Country_1,email_1@example.com
+2,Name_2,48,Country_2,email_2@example.com
+3,Name_3,61,Country_3,email_3@example.com
+4,Name_4,32,Country_4,email_4@example.com
+5,Name_5,69,Country_5,email_5@example.com
+6,Name_6,32,Country_6,email_6@example.com
+7,Name_7,62,Country_7,email_7@example.com
+8,Name_8,39,Country_8,email_8@example.com
+9,Name_9,40,Country_9,email_9@example.com
+10,Name_10,32,Country_0,email_10@example.com
+11,Name_11,24,Country_1,email_11@example.com
+12,Name_12,45,Country_2,email_12@example.com
+13,Name_13,39,Country_3,email_13@example.com
+14,Name_14,18,Country_4,email_14@example.com
+15,Name_15,66,Country_5,email_15@example.com
+16,Name_16,48,Country_6,email_16@example.com
+17,Name_17,60,Country_7,email_17@example.com
+18,Name_18,31,Country_8,email_18@example.com
+19,Name_19,43,Country_9,email_19@example.com
+20,Name_20,33,Country_0,email_20@example.com
+21,Name_21,32,Country_1,email_21@example.com
+22,Name_22,68,Country_2,email_22@example.com
+23,Name_23,44,Country_3,email_23@example.com
+24,Name_24,32,Country_4,email_24@example.com
+25,Name_25,33,Country_5,email_25@example.com
+26,Name_26,46,Country_6,email_26@example.com
+27,Name_27,38,Country_7,email_27@example.com
+28,Name_28,50,Country_8,email_28@example.com
+29,Name_29,68,Country_9,email_29@example.com
+30,Name_30,66,Country_0,email_30@example.com
+31,Name_31,60,Country_1,email_31@example.com
+32,Name_32,53,Country_2,email_32@example.com
+33,Name_33,30,Country_3,email_33@example.com
+34,Name_34,30,Country_4,email_34@example.com
+35,Name_35,43,Country_5,email_35@example.com
+36,Name_36,44,Country_6,email_36@example.com
+37,Name_37,31,Country_7,email_37@example.com
+38,Name_38,35,Country_8,email_38@example.com
+39,Name_39,56,Country_9,email_39@example.com
+40,Name_40,35,Country_0,email_40@example.com
+41,Name_41,62,Country_1,email_41@example.com
+42,Name_42,63,Country_2,email_42@example.com
+43,Name_43,51,Country_3,email_43@example.com
+44,Name_44,52,Country_4,email_44@example.com
+45,Name_45,66,Country_5,email_45@example.com
+46,Name_46,69,Country_6,email_46@example.com
+47,Name_47,68,Country_7,email_47@example.com
+48,Name_48,68,Country_8,email_48@example.com
+49,Name_49,69,Country_9,email_49@example.com
+50,Name_50,46,Country_0,email_50@example.com
--- a/packages/markitup/tests/test_files/test.docx
+++ b/packages/markitup/tests/test_files/test.docx
--- a/packages/markitup/uv.lock
+++ b/packages/markitup/uv.lock
@ -173,15 +173,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
 ]

-[[package]]
-name = "et-xmlfile"
-version = "2.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 },
-]
-
 [[package]]
 name = "flatbuffers"
 version = "25.2.10"
@ -348,7 +339,6 @@ dependencies = [
    { name = "mammoth" },
    { name = "markdownify" },
    { name = "olefile" },
-    { name = "openpyxl" },
    { name = "pandas" },
    { name = "pydub" },
    { name = "pymupdf" },
@ -356,7 +346,6 @@ dependencies = [
    { name = "python-pptx" },
    { name = "requests" },
    { name = "speechrecognition" },
-    { name = "xlrd" },
 ]

 [package.metadata]
@ -368,7 +357,6 @@ requires-dist = [
    { name = "mammoth" },
    { name = "markdownify" },
    { name = "olefile" },
-    { name = "openpyxl" },
    { name = "pandas" },
    { name = "pydub" },
    { name = "pymupdf", specifier = ">=1.25.5" },
@ -376,7 +364,6 @@ requires-dist = [
    { name = "python-pptx" },
    { name = "requests" },
    { name = "speechrecognition" },
-    { name = "xlrd" },
 ]

 [[package]]
@ -492,18 +479,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
 ]

-[[package]]
-name = "openpyxl"
-version = "3.1.5"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "et-xmlfile" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 },
-]
-
 [[package]]
 name = "packaging"
 version = "25.0"
@ -847,15 +822,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 },
 ]

-[[package]]
-name = "xlrd"
-version = "2.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/a6/b3/19a2540d21dea5f908304375bd43f5ed7a4c28a370dc9122c565423e6b44/xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88", size = 100259 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a6/0c/c2a72d51fe56e08a08acc85d13013558a2d793028ae7385448a6ccdfae64/xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd", size = 96531 },
-]
-
 [[package]]
 name = "xlsxwriter"
 version = "3.2.3"