Merge d3c3b24640 into f01c6c5277
This commit is contained in:
commit
0de59e68e3
4 changed files with 75 additions and 12 deletions
|
|
@ -1,7 +1,8 @@
|
||||||
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._wav_converter import WavConverter
|
from ._wav_converter import WavConverter, IS_WHISPER_CAPABLE
|
||||||
from warnings import resetwarnings, catch_warnings
|
from warnings import resetwarnings, catch_warnings
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
|
|
@ -25,7 +26,8 @@ finally:
|
||||||
|
|
||||||
class Mp3Converter(WavConverter):
|
class Mp3Converter(WavConverter):
|
||||||
"""
|
"""
|
||||||
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed),
|
||||||
|
and speech transcription (if `speech_recognition` AND `pydub` are installed, or OpenAI Whisper is configured).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|
@ -59,8 +61,17 @@ class Mp3Converter(WavConverter):
|
||||||
if f in metadata:
|
if f in metadata:
|
||||||
md_content += f"{f}: {metadata[f]}\n"
|
md_content += f"{f}: {metadata[f]}\n"
|
||||||
|
|
||||||
# Transcribe
|
# Try transcribing with Whisper first if OpenAI client is available
|
||||||
if IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
llm_client = kwargs.get("llm_client")
|
||||||
|
if IS_WHISPER_CAPABLE and llm_client is not None:
|
||||||
|
try:
|
||||||
|
transcript = self._transcribe_with_whisper(local_path, llm_client)
|
||||||
|
if transcript:
|
||||||
|
md_content += "\n\n### Audio Transcript (Whisper):\n" + transcript
|
||||||
|
except Exception as e:
|
||||||
|
md_content += f"\n\n### Audio Transcript:\nError transcribing with Whisper: {str(e)}"
|
||||||
|
# Whisper is unavailable or no llm_client was given: use speech_recognition instead
|
||||||
|
elif IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
||||||
handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
||||||
os.close(handle)
|
os.close(handle)
|
||||||
try:
|
try:
|
||||||
|
|
@ -78,11 +89,9 @@ class Mp3Converter(WavConverter):
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
os.unlink(temp_path)
|
os.unlink(temp_path)
|
||||||
|
|
||||||
# Return the result
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=md_content.strip(),
|
text_content=md_content.strip(),
|
||||||
|
|
|
||||||
|
|
@ -1,20 +1,30 @@
|
||||||
|
import logging
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||||
|
IS_WHISPER_CAPABLE = False
|
||||||
try:
|
try:
|
||||||
import speech_recognition as sr
|
import speech_recognition as sr
|
||||||
|
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from openai import OpenAI
|
||||||
|
IS_WHISPER_CAPABLE = True
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class WavConverter(MediaConverter):
|
class WavConverter(MediaConverter):
|
||||||
"""
|
"""
|
||||||
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed),
|
||||||
|
and speech transcription (if `speech_recognition` is installed or OpenAI Whisper is configured).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|
@ -48,8 +58,17 @@ class WavConverter(MediaConverter):
|
||||||
if f in metadata:
|
if f in metadata:
|
||||||
md_content += f"{f}: {metadata[f]}\n"
|
md_content += f"{f}: {metadata[f]}\n"
|
||||||
|
|
||||||
# Transcribe
|
# Try transcribing with Whisper first if OpenAI client is available
|
||||||
if IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
llm_client = kwargs.get("llm_client")
|
||||||
|
if IS_WHISPER_CAPABLE and llm_client is not None :
|
||||||
|
try:
|
||||||
|
transcript = self._transcribe_with_whisper(local_path, llm_client)
|
||||||
|
if transcript:
|
||||||
|
md_content += "\n\n### Audio Transcript (Whisper):\n" + transcript
|
||||||
|
except Exception as e:
|
||||||
|
md_content += f"\n\n### Audio Transcript:\nError transcribing with Whisper: {str(e)}"
|
||||||
|
# Whisper is unavailable or no llm_client was given: use speech_recognition instead
|
||||||
|
elif IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
||||||
try:
|
try:
|
||||||
transcript = self._transcribe_audio(local_path)
|
transcript = self._transcribe_audio(local_path)
|
||||||
md_content += "\n\n### Audio Transcript:\n" + (
|
md_content += "\n\n### Audio Transcript:\n" + (
|
||||||
|
|
@ -65,6 +84,20 @@ class WavConverter(MediaConverter):
|
||||||
text_content=md_content.strip(),
|
text_content=md_content.strip(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _transcribe_with_whisper(self, local_path: str, client) -> str:
|
||||||
|
"""Transcribe audio using OpenAI's Whisper model, falling back to speech_recognition if it fails."""
|
||||||
|
try:
|
||||||
|
with open(local_path, "rb") as audio_file:
|
||||||
|
transcription = client.audio.transcriptions.create(
|
||||||
|
model="whisper-1",
|
||||||
|
file=audio_file
|
||||||
|
)
|
||||||
|
return transcription.text.strip()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Whisper transcription attempt failed: {str(e)}")
|
||||||
|
logger.info("Falling back to speech_recognition...")
|
||||||
|
return self._transcribe_audio(local_path)
|
||||||
|
|
||||||
def _transcribe_audio(self, local_path) -> str:
|
def _transcribe_audio(self, local_path) -> str:
|
||||||
recognizer = sr.Recognizer()
|
recognizer = sr.Recognizer()
|
||||||
with sr.AudioFile(local_path) as source:
|
with sr.AudioFile(local_path) as source:
|
||||||
|
|
|
||||||
BIN
packages/markitdown/tests/test_files/test.wav
Normal file
BIN
packages/markitdown/tests/test_files/test.wav
Normal file
Binary file not shown.
|
|
@ -150,6 +150,10 @@ JSON_TEST_STRINGS = [
|
||||||
"9700dc99-6685-40b4-9a3a-5e406dcb37f3",
|
"9700dc99-6685-40b4-9a3a-5e406dcb37f3",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
AUDIO_TEST_STRINGS = [
|
||||||
|
"small step",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# --- Helper Functions ---
|
# --- Helper Functions ---
|
||||||
def validate_strings(result, expected_strings, exclude_strings=None):
|
def validate_strings(result, expected_strings, exclude_strings=None):
|
||||||
|
|
@ -340,6 +344,22 @@ def test_markitdown_llm() -> None:
|
||||||
assert test_string in result.text_content.lower()
|
assert test_string in result.text_content.lower()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
)
def test_markitdown_audio_transcription() -> None:
    """Check that a WAV file is transcribed when an OpenAI client is supplied."""
    converter = MarkItDown(llm_client=openai.OpenAI())

    # Convert the bundled sample recording with the LLM client attached.
    wav_path = os.path.join(TEST_FILES_DIR, "test.wav")
    converted = converter.convert(wav_path)

    # Every expected phrase must appear in the output, ignoring case.
    for expected_phrase in AUDIO_TEST_STRINGS:
        assert expected_phrase.lower() in converted.text_content.lower()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
|
|
@ -347,4 +367,5 @@ if __name__ == "__main__":
|
||||||
test_exceptions()
|
test_exceptions()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
# test_markitdown_llm()
|
# test_markitdown_llm()
|
||||||
|
# test_markitdown_audio_transcription()
|
||||||
print("All tests passed!")
|
print("All tests passed!")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue