add whisper support for audio transcript. only trigger when have llm_client and openai

2025-02-11 17:26:40 -08:00 · 2025-02-11 17:26:40 -08:00 · 8301427ab5
commit 8301427ab5
parent 97eeed5f32
2 changed files with 46 additions and 12 deletions
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@ -1,7 +1,8 @@
+import os
 import tempfile
 from typing import Union
 from ._base import DocumentConverter, DocumentConverterResult
-from ._wav_converter import WavConverter
+from ._wav_converter import WavConverter, IS_WHISPER_CAPABLE
 from warnings import resetwarnings, catch_warnings

 # Optional Transcription support
@ -25,7 +26,8 @@ finally:

 class Mp3Converter(WavConverter):
    """
-    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
+    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), 
+    and speech transcription (if `speech_recognition` AND `pydub` are installed, or OpenAI Whisper is configured).
    """

    def __init__(
@ -59,18 +61,27 @@ class Mp3Converter(WavConverter):
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

-        # Transcribe
-        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
+        # Try transcribing with Whisper first if OpenAI client is available
+        llm_client = kwargs.get("llm_client")
+        if IS_WHISPER_CAPABLE and llm_client is not None:
+            try:
+                transcript = self._transcribe_with_whisper(local_path, llm_client)
+                if transcript:
+                    md_content += "\n\n### Audio Transcript (Whisper):\n" + transcript
+            except Exception as e:
+                md_content += f"\n\n### Audio Transcript:\nError transcribing with Whisper: {str(e)}"
+        # Fall back to speech_recognition if Whisper failed or isn't available
+        elif IS_AUDIO_TRANSCRIPTION_CAPABLE:
            handle, temp_path = tempfile.mkstemp(suffix=".wav")
            os.close(handle)
            try:
                sound = pydub.AudioSegment.from_mp3(local_path)
                sound.export(temp_path, format="wav")
-
+                
                _args = dict()
                _args.update(kwargs)
                _args["file_extension"] = ".wav"
-
+                
                try:
                    transcript = super()._transcribe_audio(temp_path).strip()
                    md_content += "\n\n### Audio Transcript:\n" + (
@ -78,11 +89,9 @@ class Mp3Converter(WavConverter):
                    )
                except Exception:
                    md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
-
            finally:
                os.unlink(temp_path)

-        # Return the result
        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
--- a/packages/markitdown/src/markitdown/converters/_wav_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@ -4,17 +4,24 @@ from ._media_converter import MediaConverter

 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
+IS_WHISPER_CAPABLE = False
 try:
    import speech_recognition as sr
-
    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
 except ModuleNotFoundError:
    pass

+try:
+    from openai import OpenAI
+    IS_WHISPER_CAPABLE = True
+except ModuleNotFoundError:
+    pass
+

 class WavConverter(MediaConverter):
    """
-    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), 
+    and speech transcription (if `speech_recognition` is installed or OpenAI Whisper is configured).
    """

    def __init__(
@ -48,8 +55,17 @@ class WavConverter(MediaConverter):
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

-        # Transcribe
-        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
+        # Try transcribing with Whisper first if OpenAI client is available
+        llm_client = kwargs.get("llm_client")
+        if IS_WHISPER_CAPABLE and llm_client is not None :
+            try:
+                transcript = self._transcribe_with_whisper(local_path, llm_client)
+                if transcript:
+                    md_content += "\n\n### Audio Transcript (Whisper):\n" + transcript
+            except Exception as e:
+                md_content += f"\n\n### Audio Transcript:\nError transcribing with Whisper: {str(e)}"
+        # Fall back to speech_recognition if Whisper failed or isn't available
+        elif IS_AUDIO_TRANSCRIPTION_CAPABLE:
            try:
                transcript = self._transcribe_audio(local_path)
                md_content += "\n\n### Audio Transcript:\n" + (
@ -65,6 +81,15 @@ class WavConverter(MediaConverter):
            text_content=md_content.strip(),
        )

+    def _transcribe_with_whisper(self, local_path: str, client) -> str:
+        """Transcribe audio using OpenAI's Whisper model."""
+        with open(local_path, "rb") as audio_file:
+            transcription = client.audio.transcriptions.create(
+                model="whisper-1",
+                file=audio_file
+            )
+            return transcription.text.strip()
+
    def _transcribe_audio(self, local_path) -> str:
        recognizer = sr.Recognizer()
        with sr.AudioFile(local_path) as source: