diff --git a/packages/markitup/src/markitup/__init__.py b/packages/markitup/src/markitup/__init__.py
index ba22cbc..b06156d 100644
--- a/packages/markitup/src/markitup/__init__.py
+++ b/packages/markitup/src/markitup/__init__.py
@@ -9,7 +9,6 @@ from ._markitup import (
 from ._base_converter import DocumentConverterResult, DocumentConverter
 from ._schemas import StreamInfo, Config
 from ._exceptions import (
-    MarkItUpException,
     MissingDependencyException,
     FailedConversionAttempt,
     FileConversionException,
@@ -21,7 +20,6 @@ __all__ = [
     "MarkItUp",
     "DocumentConverter",
     "DocumentConverterResult",
-    "MarkItUpException",
     "MissingDependencyException",
     "FailedConversionAttempt",
     "FileConversionException",
diff --git a/packages/markitup/src/markitup/_base_converter.py b/packages/markitup/src/markitup/_base_converter.py
index 9de88df..0d5300d 100644
--- a/packages/markitup/src/markitup/_base_converter.py
+++ b/packages/markitup/src/markitup/_base_converter.py
@@ -4,6 +4,7 @@ from warnings import warn
 from typing import Any, Union, BinaryIO, Optional, List, Dict
 from ._schemas import StreamInfo
 import re
+import base64
 
 
 class DocumentConverterResult:
@@ -11,9 +12,11 @@ class DocumentConverterResult:
 
     def __init__(
         self,
-        markdown: str,
+        markdown: str = "",
         *,
         title: Optional[str] = None,
+        audio_stream: Optional[BinaryIO] = None,
+        stream_info: Optional[StreamInfo] = None,
     ):
         """
         Initialize the DocumentConverterResult.
@@ -26,7 +29,9 @@ class DocumentConverterResult:
         - title: Optional title of the document.
         """
         self.markdown = markdown
+        self.audio_stream = audio_stream
         self.title = title
+        self.stream_info = stream_info
 
     def to_llm(self) -> List[Dict[str, Any]]:
         """
@@ -79,7 +84,14 @@ class DocumentConverterResult:
                     "type": "text",
                     "text": text_chunk
                 })
-
+        if self.audio_stream:
+            # stream_info is Optional, so fail with a clear message instead of an AttributeError on None.
+            if self.stream_info is None:
+                raise ValueError("audio_stream requires stream_info to supply its MIME type")
+            audio_b64 = base64.b64encode(self.audio_stream.read()).decode("utf-8")
+            content.append(
+                {"type": "media", "mime_type": self.stream_info.magic_type, "data": audio_b64}
+            )
         return content
 
     @property
@@ -104,7 +116,7 @@ class DocumentConverter:
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+        ** kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
         """
         Convert a document to Markdown text.
diff --git a/packages/markitup/src/markitup/_exceptions.py b/packages/markitup/src/markitup/_exceptions.py
index fca098f..b3a478e 100644
--- a/packages/markitup/src/markitup/_exceptions.py
+++ b/packages/markitup/src/markitup/_exceptions.py
@@ -8,15 +8,7 @@ MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential
 * etc."""
 
 
-class MarkItUpException(Exception):
-    """
-    Base exception class for MarkItUp.
-    """
-
-    pass
-
-
-class MissingDependencyException(MarkItUpException):
+class MissingDependencyException(Exception):
     """
     Converters shipped with MarkItUp may depend on optional
     dependencies. This exception is thrown when a converter's
@@ -31,7 +23,7 @@ class MissingDependencyException(MarkItUpException):
     pass
 
 
-class UnsupportedFormatException(MarkItUpException):
+class UnsupportedFormatException(Exception):
     """
     Thrown when no suitable converter was found for the given file.
     """
@@ -39,17 +31,16 @@ class UnsupportedFormatException(MarkItUpException):
     pass
 
 
-class FailedConversionAttempt(object):
+class FailedConversionAttempt(Exception):
     """
-    Represents an a single attempt to convert a file.
+    Raised when a single attempt to convert a file fails.
     """
 
-    def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
-        self.converter = converter
-        self.exc_info = exc_info
+    def __init__(self):  # fixed message; no per-attempt details are recorded
+        super().__init__("Conversion attempt failed!")
 
 
-class FileConversionException(MarkItUpException):
+class FileConversionException(Exception):
     """
     Thrown when a suitable converter was found, but the conversion
     process fails for any reason.
diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py
index 0b5ddf0..2cfb67a 100644
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@@ -14,7 +14,7 @@ from .converters import (
     XlsxConverter,
     XlsConverter,
     PptxConverter,
-    # AudioConverter,
+    AudioConverter,
     CsvConverter,
 )
 
@@ -53,9 +53,11 @@ class MarkItUp:
                 case "text":
                     return PlainTextConverter().convert(stream, stream_info), stream_info
                 case "pptx":
-                    return PptxConverter().convert(stream, stream_info), stream_info
+                    return PptxConverter(config=self.config).convert(stream, stream_info), stream_info
                 case "pdf":
-                    return PdfConverter().convert(stream, stream_info), stream_info
+                    return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
+                case "audio":
+                    return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
         except FailedConversionAttempt:
             raise FileConversionException(
                 f"Failed to convert file of type {stream_info.magic_type}")
diff --git a/packages/markitup/src/markitup/_schemas.py b/packages/markitup/src/markitup/_schemas.py
index ecfce92..9cbe1c9 100644
--- a/packages/markitup/src/markitup/_schemas.py
+++ b/packages/markitup/src/markitup/_schemas.py
@@ -10,6 +10,6 @@ class StreamInfo:
 
 @dataclass
 class Config:
-    modality: List[Literal["image", "audio"]] = field(
+    modalities: List[Literal["image", "audio"]] = field(
         default_factory=lambda: ["image", "audio"]
     )
diff --git a/packages/markitup/src/markitup/converter_utils/utils.py b/packages/markitup/src/markitup/converter_utils/utils.py
index 12e533d..3c2cda5 100644
--- a/packages/markitup/src/markitup/converter_utils/utils.py
+++ b/packages/markitup/src/markitup/converter_utils/utils.py
@@ -2,6 +2,10 @@ import os
 from io import BytesIO
 from markitup._schemas import StreamInfo
 import magic
+import speech_recognition as sr
+import pydub
+import io
+from typing import BinaryIO
 
 
 def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
@@ -100,3 +104,25 @@ def detect_file_types(file_dict):
         byte_stream.seek(original_position)
 
     return result
+
+
+def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
+    """Transcribe MP3 ("audio/mpeg") or WAV ("audio/x-wav") audio with Google speech recognition.
+
+    Returns "[No speech detected]" if nothing is recognized; raises ValueError for other types.
+    """
+    if magic_type == "audio/mpeg":
+        audio_source = io.BytesIO()  # sr.AudioFile is given WAV data, so re-encode the MP3 in memory
+        pydub.AudioSegment.from_file(file_stream, format="mp3").export(audio_source, format="wav")
+        audio_source.seek(0)
+    elif magic_type == "audio/x-wav":
+        audio_source = file_stream
+    else:
+        raise ValueError(f"Unsupported audio format: {magic_type}")
+    recognizer = sr.Recognizer()
+    with sr.AudioFile(audio_source) as source:
+        audio = recognizer.record(source)
+    try:
+        return recognizer.recognize_google(audio).strip() or "[No speech detected]"
+    except sr.UnknownValueError:  # recognize_google raises this when no speech is intelligible
+        return "[No speech detected]"
diff --git a/packages/markitup/src/markitup/converters/__init__.py b/packages/markitup/src/markitup/converters/__init__.py
index a82b80b..5aea2af 100644
--- a/packages/markitup/src/markitup/converters/__init__.py
+++ b/packages/markitup/src/markitup/converters/__init__.py
@@ -8,7 +8,7 @@ from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
-# from ._audio_converter import AudioConverter
+from ._audio_converter import AudioConverter
 from ._csv_converter import CsvConverter
 from ._markdownify import _CustomMarkdownify
 
@@ -27,7 +27,7 @@ __all__ = [
     "XlsConverter",
     "PptxConverter",
     "ImageConverter",
-    # "AudioConverter",
+    "AudioConverter",
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
diff --git a/packages/markitup/src/markitup/converters/_audio_converter.py b/packages/markitup/src/markitup/converters/_audio_converter.py
index eeff58e..eec84cd 100644
--- a/packages/markitup/src/markitup/converters/_audio_converter.py
+++ b/packages/markitup/src/markitup/converters/_audio_converter.py
@@ -1,23 +1,10 @@
 import io
-from typing import Any, BinaryIO, Optional
+from typing import Any, BinaryIO, Optional, Tuple
 
-from ._exiftool import exiftool_metadata
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._schemas import StreamInfo
+from .._schemas import StreamInfo, Config
 from .._exceptions import MissingDependencyException
-
-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "audio/x-wav",
-    "audio/mpeg",
-    "video/mp4",
-]
-
-ACCEPTED_FILE_EXTENSIONS = [
-    ".wav",
-    ".mp3",
-    ".m4a",
-    ".mp4",
-]
+from ..converter_utils.utils import transcribe_audio
 
 
 class AudioConverter(DocumentConverter):
@@ -25,78 +12,25 @@ class AudioConverter(DocumentConverter):
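-    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+    Converts audio files to markdown via speech transcription when "audio" is not in `config.modalities`; otherwise returns the audio stream itself in the result.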
     """
 
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+    def __init__(self, config: Config):
+        self.config = config
 
     def convert(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> DocumentConverterResult:
+        ** kwargs: Any,  # Options to pass to the converter
+    ) -> Tuple[DocumentConverterResult, StreamInfo]:
         md_content = ""
 
-        # Add metadata
-        metadata = exiftool_metadata(
-            file_stream, exiftool_path=kwargs.get("exiftool_path")
-        )
-        if metadata:
-            for f in [
-                "Title",
-                "Artist",
-                "Author",
-                "Band",
-                "Album",
-                "Genre",
-                "Track",
-                "DateTimeOriginal",
-                "CreateDate",
-                # "Duration", -- Wrong values when read from memory
-                "NumChannels",
-                "SampleRate",
-                "AvgBytesPerSec",
-                "BitsPerSample",
-            ]:
-                if f in metadata:
-                    md_content += f"{f}: {metadata[f]}\n"
-
-        # Figure out the audio format for transcription
-        if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
-            audio_format = "wav"
-        elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
-            audio_format = "mp3"
-        elif (
-            stream_info.extension in [".mp4", ".m4a"]
-            or stream_info.mimetype == "video/mp4"
-        ):
-            audio_format = "mp4"
-        else:
-            audio_format = None
-
         # Transcribe
-        if audio_format:
-            try:
-                transcript = transcribe_audio(
-                    file_stream, audio_format=audio_format)
-                if transcript:
-                    md_content += "\n\n### Audio Transcript:\n" + transcript
-            except MissingDependencyException:
-                pass
+        if 'audio' not in self.config.modalities:
+            transcript = transcribe_audio(
+                file_stream, magic_type=stream_info.magic_type)
+            if transcript:
+                md_content += "\n\n### Audio Transcript:\n" + transcript
+            return DocumentConverterResult(markdown=md_content.strip())
+        else:
+            return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)
 
         # Return the result
-        return DocumentConverterResult(markdown=md_content.strip())
diff --git a/packages/markitup/src/markitup/converters/_pdf_converter.py b/packages/markitup/src/markitup/converters/_pdf_converter.py
index 0794c8a..8839eff 100644
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@@ -3,7 +3,7 @@ import io
 import base64
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._schemas import StreamInfo
+from .._schemas import StreamInfo, Config
 
 import fitz
 
@@ -13,6 +13,9 @@ class PdfConverter(DocumentConverter):
     Converts PDFs to Markdown with embedded images.
     """
 
+    def __init__(self, config: Config):
+        self.config = config
+
     def convert(
         self,
         file_stream: BinaryIO,
@@ -36,28 +39,28 @@ class PdfConverter(DocumentConverter):
 
             # Extract images from the page
             image_list = page.get_images(full=True)
+            if 'image' in self.config.modalities:
+                for img_index, img_info in enumerate(image_list):
+                    xref = img_info[0]  # Get the image reference
+                    base_image = doc.extract_image(xref)
 
-            for img_index, img_info in enumerate(image_list):
-                xref = img_info[0]  # Get the image reference
-                base_image = doc.extract_image(xref)
-
-                if base_image:
-                    image_bytes = base_image["image"]
-                    image_ext = base_image["ext"]
-
-                    try:
-                        # Convert image to base64 for markdown embedding
-                        img_base64 = base64.b64encode(
-                            image_bytes).decode('utf-8')
-                        # Add image to markdown with a unique identifier
-                        image_count += 1
-                        markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
-                    except Exception as e:
-                        markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
+                    if base_image:
+                        image_bytes = base_image["image"]
+                        image_ext = base_image["ext"]
 
+                        try:
+                            # Convert image to base64 for markdown embedding
+                            img_base64 = base64.b64encode(
+                                image_bytes).decode('utf-8')
+                            # Add image to markdown with a unique identifier
+                            image_count += 1
+                            markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
+                        except Exception as e:
+                            markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
+            elif image_list:  # skip the notice on pages that have no images at all
+                markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
         # Close the document to free resources
         doc.close()
-        print(markdown_content)
         return DocumentConverterResult(
             markdown=markdown_content,
         )
diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py
index 3ee4595..5f11d37 100644
--- a/packages/markitup/src/markitup/converters/_pptx_converter.py
+++ b/packages/markitup/src/markitup/converters/_pptx_converter.py
@@ -10,7 +10,7 @@ from operator import attrgetter
 
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._schemas import StreamInfo
+from .._schemas import StreamInfo, Config
 import pptx
 
 
@@ -26,9 +26,9 @@ class PptxConverter(DocumentConverter):
     Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
     """
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, config: Config):
         self._html_converter = HtmlConverter()
+        self.config = config
 
     def convert(
         self,
@@ -70,11 +70,15 @@ class PptxConverter(DocumentConverter):
                     alt_text = re.sub(r"\s+", " ", alt_text).strip()
 
-                    # If keep_data_uris is True, use base64 encoding for images
+                    # Embed the image as a base64 data URI only when "image" is in config.modalities
-
-                    blob = shape.image.blob
-                    content_type = shape.image.content_type or "image/png"
-                    b64_string = base64.b64encode(blob).decode("utf-8")
-                    md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
+                    if 'image' in self.config.modalities:
+                        blob = shape.image.blob
+                        content_type = shape.image.content_type or "image/png"
+                        b64_string = base64.b64encode(blob).decode("utf-8")
+                        md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
+                    else:
+                        # Image input disabled: link to a placeholder file named after the shape instead.
+                        filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                        md_content += f"\n![{alt_text}]({filename})\n"
 
                 # Tables
                 if self._is_table(shape):