diff --git a/packages/markitup/src/markitup/__init__.py b/packages/markitup/src/markitup/__init__.py index ba22cbc..b06156d 100644 --- a/packages/markitup/src/markitup/__init__.py +++ b/packages/markitup/src/markitup/__init__.py @@ -9,7 +9,6 @@ from ._markitup import ( from ._base_converter import DocumentConverterResult, DocumentConverter from ._schemas import StreamInfo, Config from ._exceptions import ( - MarkItUpException, MissingDependencyException, FailedConversionAttempt, FileConversionException, @@ -21,7 +20,6 @@ __all__ = [ "MarkItUp", "DocumentConverter", "DocumentConverterResult", - "MarkItUpException", "MissingDependencyException", "FailedConversionAttempt", "FileConversionException", diff --git a/packages/markitup/src/markitup/_base_converter.py b/packages/markitup/src/markitup/_base_converter.py index 9de88df..0d5300d 100644 --- a/packages/markitup/src/markitup/_base_converter.py +++ b/packages/markitup/src/markitup/_base_converter.py @@ -4,6 +4,7 @@ from warnings import warn from typing import Any, Union, BinaryIO, Optional, List, Dict from ._schemas import StreamInfo import re +import base64 class DocumentConverterResult: @@ -11,9 +12,11 @@ class DocumentConverterResult: def __init__( self, - markdown: str, + markdown: str = "", *, title: Optional[str] = None, + audio_stream: Optional[BinaryIO] = None, + stream_info: Optional[StreamInfo] = None, ): """ Initialize the DocumentConverterResult. @@ -26,7 +29,9 @@ class DocumentConverterResult: - title: Optional title of the document. 
class FailedConversionAttempt(Exception):
    """
    Raised when a single attempt to convert a file fails.

    Parameters:
    - converter: Optional converter that made the failed attempt.
    - exc_info: Optional exception-info tuple captured from the failure.

    Both arguments are optional, so the exception can be raised bare or
    with the diagnostics that callers used to supply.
    """

    def __init__(self, converter: Any = None, exc_info: Optional[tuple] = None):
        # The message is fixed; the diagnostics are kept as attributes so a
        # handler can inspect which converter failed and why.
        super().__init__("Conversion attempt failed!")
        self.converter = converter
        self.exc_info = exc_info
# Detected MIME type -> audio container format understood by the transcriber.
_AUDIO_FORMAT_BY_MAGIC_TYPE = {
    "audio/mpeg": "mp3",
    "audio/x-wav": "wav",
}


def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
    """
    Transcribe the speech contained in an MP3 or WAV stream.

    Parameters:
    - file_stream: Binary stream holding the audio data.
    - magic_type: MIME type of the stream; "audio/mpeg" (MP3) and
      "audio/x-wav" (WAV) are supported.

    Returns the transcript with surrounding whitespace removed, or
    "[No speech detected]" when no speech could be recognised.

    Raises ValueError when `magic_type` is not a supported audio type.
    """
    audio_format = _AUDIO_FORMAT_BY_MAGIC_TYPE.get(magic_type)
    if audio_format is None:
        raise ValueError(f"Unsupported audio format: {magic_type}")

    if audio_format == "mp3":
        # MP3 input is re-encoded to WAV in memory before being handed to
        # sr.AudioFile; WAV input is passed through unchanged.
        audio_source = io.BytesIO()
        pydub.AudioSegment.from_file(file_stream, format=audio_format).export(
            audio_source, format="wav"
        )
        audio_source.seek(0)
    else:
        audio_source = file_stream

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_source) as source:
        audio = recognizer.record(source)

    try:
        transcript = recognizer.recognize_google(audio).strip()
    except sr.UnknownValueError:
        # Per the SpeechRecognition docs, recognize_google raises this when
        # the audio contains nothing intelligible; report it as "no speech"
        # rather than letting it escape.
        return "[No speech detected]"
    return transcript or "[No speech detected]"
class AudioConverter(DocumentConverter):
    """
    Converts audio files according to the configured modalities.

    When "audio" is listed in `config.modalities`, the untouched audio stream
    is handed back on the result together with its stream info. Otherwise the
    audio is transcribed with `transcribe_audio` and returned as Markdown.
    """

    def __init__(self, config: Config):
        """
        Parameters:
        - config: Conversion settings; `config.modalities` is consulted by
          `convert` to choose between pass-through and transcription.
        """
        super().__init__()
        self.config = config

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert an audio stream.

        Parameters:
        - file_stream: Binary stream holding the audio data.
        - stream_info: Stream description; its `magic_type` selects the
          audio format used for transcription.

        Returns a DocumentConverterResult that carries either the raw audio
        stream (audio modality enabled) or a Markdown transcript.
        """
        # Audio modality enabled: forward the stream itself, no transcription.
        if "audio" in self.config.modalities:
            return DocumentConverterResult(
                audio_stream=file_stream, stream_info=stream_info
            )

        # Otherwise fall back to a text transcript of the audio.
        transcript = transcribe_audio(
            file_stream, magic_type=stream_info.magic_type
        )
        md_content = ""
        if transcript:
            md_content += "\n\n### Audio Transcript:\n" + transcript
        return DocumentConverterResult(markdown=md_content.strip())
base_image["ext"] + try: + # Convert image to base64 for markdown embedding + img_base64 = base64.b64encode( + image_bytes).decode('utf-8') + # Add image to markdown with a unique identifier + image_count += 1 + markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" + except Exception as e: + markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n" + else: + markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n" # Close the document to free resources doc.close() - print(markdown_content) return DocumentConverterResult( markdown=markdown_content, ) diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py index 3ee4595..5f11d37 100644 --- a/packages/markitup/src/markitup/converters/_pptx_converter.py +++ b/packages/markitup/src/markitup/converters/_pptx_converter.py @@ -10,7 +10,7 @@ from operator import attrgetter from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult -from .._schemas import StreamInfo +from .._schemas import StreamInfo, Config import pptx @@ -26,9 +26,9 @@ class PptxConverter(DocumentConverter): Converts PPTX files to Markdown. Supports heading, tables and images with alt text. 
def __init__(self, config: "Config"):
    """
    Set up the PPTX converter.

    Parameters:
    - config: Conversion settings, stored as `self.config`; `convert` checks
      `config.modalities` to decide whether pictures are embedded as base64
      data URIs or referenced by file name.
    """
    # Keep the base-class initialiser in the chain, as this constructor did
    # before the `config` parameter was introduced.
    super().__init__()
    self._html_converter = HtmlConverter()
    self.config = config