diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py index 91fd270..cbbdab0 100644 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py @@ -1,7 +1,8 @@ +import os import tempfile from typing import Union from ._base import DocumentConverter, DocumentConverterResult -from ._wav_converter import WavConverter +from ._wav_converter import WavConverter, IS_WHISPER_CAPABLE from warnings import resetwarnings, catch_warnings # Optional Transcription support @@ -25,7 +26,8 @@ finally: class Mp3Converter(WavConverter): """ - Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). + Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), + and speech transcription (if `speech_recognition` AND `pydub` are installed, or OpenAI Whisper is configured). """ def __init__( @@ -59,18 +61,27 @@ class Mp3Converter(WavConverter): if f in metadata: md_content += f"{f}: {metadata[f]}\n" - # Transcribe - if IS_AUDIO_TRANSCRIPTION_CAPABLE: + # Try transcribing with Whisper first if OpenAI client is available + llm_client = kwargs.get("llm_client") + if IS_WHISPER_CAPABLE and llm_client is not None: + try: + transcript = self._transcribe_with_whisper(local_path, llm_client) + if transcript: + md_content += "\n\n### Audio Transcript (Whisper):\n" + transcript + except Exception as e: + md_content += f"\n\n### Audio Transcript:\nError transcribing with Whisper: {str(e)}" + # Fall back to speech_recognition if Whisper failed or isn't available + elif IS_AUDIO_TRANSCRIPTION_CAPABLE: handle, temp_path = tempfile.mkstemp(suffix=".wav") os.close(handle) try: sound = pydub.AudioSegment.from_mp3(local_path) sound.export(temp_path, format="wav") - + _args = dict() _args.update(kwargs) _args["file_extension"] = ".wav" - + try: transcript = super()._transcribe_audio(temp_path).strip() md_content += "\n\n### Audio Transcript:\n" + ( @@ -78,11 +89,9 @@ class Mp3Converter(WavConverter): ) except Exception: md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." - finally: os.unlink(temp_path) - # Return the result return DocumentConverterResult( title=None, text_content=md_content.strip(), diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_wav_converter.py index 3c8d842..0ca2b57 100644 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py @@ -1,20 +1,30 @@ +import logging from typing import Union from ._base import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter +logger = logging.getLogger(__name__) + # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False +IS_WHISPER_CAPABLE = False try: import speech_recognition as sr - IS_AUDIO_TRANSCRIPTION_CAPABLE = True except ModuleNotFoundError: pass +try: + from openai import OpenAI + IS_WHISPER_CAPABLE = True +except ModuleNotFoundError: + pass + class WavConverter(MediaConverter): """ - Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). + Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), + and speech transcription (if `speech_recognition` is installed or OpenAI Whisper is configured). """ def __init__( @@ -48,8 +58,17 @@ class WavConverter(MediaConverter): if f in metadata: md_content += f"{f}: {metadata[f]}\n" - # Transcribe - if IS_AUDIO_TRANSCRIPTION_CAPABLE: + # Try transcribing with Whisper first if OpenAI client is available + llm_client = kwargs.get("llm_client") + if IS_WHISPER_CAPABLE and llm_client is not None : + try: + transcript = self._transcribe_with_whisper(local_path, llm_client) + if transcript: + md_content += "\n\n### Audio Transcript (Whisper):\n" + transcript + except Exception as e: + md_content += f"\n\n### Audio Transcript:\nError transcribing with Whisper: {str(e)}" + # Fall back to speech_recognition if Whisper failed or isn't available + elif IS_AUDIO_TRANSCRIPTION_CAPABLE: try: transcript = self._transcribe_audio(local_path) md_content += "\n\n### Audio Transcript:\n" + ( @@ -65,6 +84,20 @@ class WavConverter(MediaConverter): text_content=md_content.strip(), ) + def _transcribe_with_whisper(self, local_path: str, client) -> str: + """Transcribe audio using OpenAI's Whisper model, falling back to speech_recognition if it fails.""" + try: + with open(local_path, "rb") as audio_file: + transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file + ) + return transcription.text.strip() + except Exception as e: + logger.warning(f"Whisper transcription attempt failed: {str(e)}") + logger.info("Falling back to speech_recognition...") + return self._transcribe_audio(local_path) + def _transcribe_audio(self, local_path) -> str: recognizer = sr.Recognizer() with sr.AudioFile(local_path) as source: diff --git a/packages/markitdown/tests/test_files/test.wav b/packages/markitdown/tests/test_files/test.wav new file mode 100644 index 0000000..bc78141 Binary files /dev/null and b/packages/markitdown/tests/test_files/test.wav differ diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py index 0a3b56e..59a6c5e 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -150,6 +150,10 @@ JSON_TEST_STRINGS = [ "9700dc99-6685-40b4-9a3a-5e406dcb37f3", ] +AUDIO_TEST_STRINGS = [ + "small step", +] + # --- Helper Functions --- def validate_strings(result, expected_strings, exclude_strings=None): @@ -340,6 +344,22 @@ def test_markitdown_llm() -> None: assert test_string in result.text_content.lower() +@pytest.mark.skipif( + skip_llm, + reason="do not run llm tests without a key", +) +def test_markitdown_audio_transcription() -> None: + """Test audio transcription capabilities.""" + client = openai.OpenAI() + markitdown = MarkItDown(llm_client=client) + + # Test WAV transcription with Whisper + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.wav")) + + for test_string in AUDIO_TEST_STRINGS: + assert test_string.lower() in result.text_content.lower() + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() @@ -347,4 +367,5 @@ if __name__ == "__main__": test_exceptions() test_markitdown_exiftool() # test_markitdown_llm() + # test_markitdown_audio_transcription() print("All tests passed!")