add whisper support for audio transcription; only triggered when an llm_client is provided and openai is installed
parent 97eeed5f32
commit 8301427ab5

2 changed files with 46 additions and 12 deletions
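The condition in the commit title is checked at conversion time: the Whisper path runs only when the `openai` package is importable and the caller has passed an `llm_client` through the converter kwargs. A minimal usage sketch, assuming `MarkItDown` forwards its `llm_client` to converters the way upstream markitdown does (the file name is illustrative):

```python
# Minimal sketch, assuming MarkItDown plumbs llm_client through to converters.
from markitdown import MarkItDown
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# With openai installed AND llm_client set, audio files take the Whisper path;
# otherwise the speech_recognition fallback is used (if installed).
md = MarkItDown(llm_client=client)
result = md.convert("meeting.mp3")  # illustrative file name
print(result.text_content)
```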
_mp3_converter.py

```diff
@@ -1,7 +1,8 @@
 import os
 import tempfile
 from typing import Union
 from ._base import DocumentConverter, DocumentConverterResult
-from ._wav_converter import WavConverter
+from ._wav_converter import WavConverter, IS_WHISPER_CAPABLE
 from warnings import resetwarnings, catch_warnings

 # Optional Transcription support
```
```diff
@@ -25,7 +26,8 @@ finally:

 class Mp3Converter(WavConverter):
     """
-    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
+    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed),
+    and speech transcription (if `speech_recognition` AND `pydub` are installed, or OpenAI Whisper is configured).
     """

     def __init__(
```
```diff
@@ -59,18 +61,27 @@ class Mp3Converter(WavConverter):
                 if f in metadata:
                     md_content += f"{f}: {metadata[f]}\n"

-        # Transcribe
-        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
+        # Try transcribing with Whisper first if an OpenAI client is available
+        llm_client = kwargs.get("llm_client")
+        if IS_WHISPER_CAPABLE and llm_client is not None:
+            try:
+                transcript = self._transcribe_with_whisper(local_path, llm_client)
+                if transcript:
+                    md_content += "\n\n### Audio Transcript (Whisper):\n" + transcript
+            except Exception as e:
+                md_content += f"\n\n### Audio Transcript:\nError transcribing with Whisper: {str(e)}"
+        # Fall back to speech_recognition if Whisper isn't available
+        elif IS_AUDIO_TRANSCRIPTION_CAPABLE:
             handle, temp_path = tempfile.mkstemp(suffix=".wav")
             os.close(handle)
             try:
                 sound = pydub.AudioSegment.from_mp3(local_path)
                 sound.export(temp_path, format="wav")

                 _args = dict()
                 _args.update(kwargs)
                 _args["file_extension"] = ".wav"

                 try:
                     transcript = super()._transcribe_audio(temp_path).strip()
                     md_content += "\n\n### Audio Transcript:\n" + (
```
```diff
@@ -78,11 +89,9 @@ class Mp3Converter(WavConverter):
                     )
                 except Exception:
                     md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
             finally:
                 os.unlink(temp_path)

         # Return the result
         return DocumentConverterResult(
             title=None,
             text_content=md_content.strip(),
```
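Note the asymmetry between the two MP3 paths above: the Whisper branch hands the MP3 file to the API as-is, while the `speech_recognition` fallback must first bounce the audio through a temporary WAV file with `pydub`. A sketch of driving the converter directly, assuming `Mp3Converter` from the file above is in scope and constructible with defaults, and using the `convert(local_path, **kwargs)` shape the diff itself relies on (the path is illustrative):

```python
# Sketch: exercising the new branch directly. Assumes Mp3Converter is in scope
# and that converters accept convert(local_path, **kwargs), as the diff shows.
from openai import OpenAI

converter = Mp3Converter()
result = converter.convert(
    "podcast.mp3",          # illustrative path
    file_extension=".mp3",
    llm_client=OpenAI(),    # presence of this kwarg selects the Whisper branch
)
if result is not None:
    print(result.text_content)  # metadata, then "### Audio Transcript (Whisper): ..."
```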
_wav_converter.py

```diff
@@ -4,17 +4,24 @@ from ._media_converter import MediaConverter

 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
+IS_WHISPER_CAPABLE = False
 try:
     import speech_recognition as sr

     IS_AUDIO_TRANSCRIPTION_CAPABLE = True
 except ModuleNotFoundError:
     pass

+try:
+    from openai import OpenAI
+    IS_WHISPER_CAPABLE = True
+except ModuleNotFoundError:
+    pass
+

 class WavConverter(MediaConverter):
     """
-    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed),
+    and speech transcription (if `speech_recognition` is installed or OpenAI Whisper is configured).
     """

     def __init__(
```
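Both flags above use the same guarded-import idiom: probe the optional dependency once at module import, record the result, and keep the per-call check to a cheap boolean. Note that `OpenAI` itself is never used in this module; the import only proves the package is installed, since the actual client instance arrives via `kwargs["llm_client"]`. The pattern in isolation (module and function names below are illustrative, not from this commit):

```python
# The guarded-import idiom, in isolation. "optional_dep" is hypothetical.
HAS_OPTIONAL_DEP = False
try:
    import optional_dep  # hypothetical optional dependency
    HAS_OPTIONAL_DEP = True
except ModuleNotFoundError:
    pass  # feature silently degrades instead of crashing at import time

def enhanced_or_basic(data):
    # Cheap boolean check on the hot path; no repeated import attempts.
    if HAS_OPTIONAL_DEP:
        return optional_dep.process(data)  # hypothetical API
    return data  # graceful degradation
```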
```diff
@@ -48,8 +55,17 @@ class WavConverter(MediaConverter):
             if f in metadata:
                 md_content += f"{f}: {metadata[f]}\n"

-        # Transcribe
-        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
+        # Try transcribing with Whisper first if an OpenAI client is available
+        llm_client = kwargs.get("llm_client")
+        if IS_WHISPER_CAPABLE and llm_client is not None:
+            try:
+                transcript = self._transcribe_with_whisper(local_path, llm_client)
+                if transcript:
+                    md_content += "\n\n### Audio Transcript (Whisper):\n" + transcript
+            except Exception as e:
+                md_content += f"\n\n### Audio Transcript:\nError transcribing with Whisper: {str(e)}"
+        # Fall back to speech_recognition if Whisper isn't available
+        elif IS_AUDIO_TRANSCRIPTION_CAPABLE:
             try:
                 transcript = self._transcribe_audio(local_path)
                 md_content += "\n\n### Audio Transcript:\n" + (
```
```diff
@@ -65,6 +81,15 @@ class WavConverter(MediaConverter):
             text_content=md_content.strip(),
         )

+    def _transcribe_with_whisper(self, local_path: str, client) -> str:
+        """Transcribe audio using OpenAI's Whisper model."""
+        with open(local_path, "rb") as audio_file:
+            transcription = client.audio.transcriptions.create(
+                model="whisper-1",
+                file=audio_file
+            )
+        return transcription.text.strip()
+
     def _transcribe_audio(self, local_path) -> str:
         recognizer = sr.Recognizer()
         with sr.AudioFile(local_path) as source:
```
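Since `_transcribe_with_whisper` touches nothing on the client except `audio.transcriptions.create(...)`, the `llm_client` argument is effectively duck-typed: any object exposing that one method works, which makes the Whisper branch testable without network access. (Also note the method uploads the raw file; the hosted transcription endpoint caps uploads at 25 MB at the time of writing, so long recordings may need chunking.) A test sketch under that assumption; every name below is scaffolding, not part of the commit:

```python
# Test scaffolding: a duck-typed stand-in for the OpenAI client.
import os
import tempfile
from types import SimpleNamespace

class FakeTranscriptions:
    def create(self, model, file):
        # Mimic the one attribute the converter reads: .text
        return SimpleNamespace(text="  hello from the stub  ")

class FakeClient:
    audio = SimpleNamespace(transcriptions=FakeTranscriptions())

# The stub never reads the bytes, so an empty temp file is enough.
handle, path = tempfile.mkstemp(suffix=".wav")
os.close(handle)
try:
    # Assumes WavConverter() is constructible with defaults.
    text = WavConverter()._transcribe_with_whisper(path, FakeClient())
    assert text == "hello from the stub"  # the method strips whitespace
finally:
    os.unlink(path)
```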