67 lines
2.1 KiB
Python
67 lines
2.1 KiB
Python
from typing import Union
|
|
from ._base import DocumentConverterResult
|
|
from ._media_converter import MediaConverter
|
|
|
|
# Optional transcription support: `speech_recognition` is a soft dependency,
# so record whether it could be imported rather than failing at import time.
try:
    import speech_recognition as sr
except ModuleNotFoundError:
    IS_AUDIO_TRANSCRIPTION_CAPABLE = False
else:
    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
|
|
|
|
|
class WavConverter(MediaConverter):
    """
    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Convert the WAV file at `local_path` to markdown text.

        Keyword arguments read here:
            file_extension: extension of the file being converted; anything
                other than ".wav" (case-insensitive) makes this converter
                decline the file.
            exiftool_path: forwarded unchanged to `self._get_metadata`.

        Returns:
            None when the file is not a WAV; otherwise a
            DocumentConverterResult whose text holds the selected metadata
            fields followed by an "Audio Transcript" section (the latter only
            when `speech_recognition` could be imported).
        """
        # Bail if not a WAV. `or ""` also covers callers that pass
        # file_extension=None explicitly, which would otherwise raise
        # AttributeError on `.lower()` instead of declining the file.
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".wav":
            return None

        # Metadata fields copied to the output, in this order, when present.
        wanted_fields = (
            "Title",
            "Artist",
            "Author",
            "Band",
            "Album",
            "Genre",
            "Track",
            "DateTimeOriginal",
            "CreateDate",
            "Duration",
        )

        # Collect the output pieces and join once at the end.
        parts = []

        # Add metadata
        # NOTE(review): `_get_metadata` is inherited and not visible here; its
        # result is only used as a mapping (truthiness, `in`, indexing).
        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            parts.extend(
                f"{field}: {metadata[field]}\n"
                for field in wanted_fields
                if field in metadata
            )

        # Transcribe
        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
            try:
                transcript = self._transcribe_audio(local_path)
                # NOTE(review): recognize_google is documented to raise
                # UnknownValueError for unintelligible audio, so the empty
                # transcript branch may rarely trigger -- confirm intent.
                parts.append(
                    "\n\n### Audio Transcript:\n"
                    + ("[No speech detected]" if transcript == "" else transcript)
                )
            except Exception:
                # Deliberate best-effort: a transcription failure must not
                # discard the metadata that was already extracted.
                parts.append(
                    "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
                )

        return DocumentConverterResult(
            title=None,
            text_content="".join(parts).strip(),
        )

    def _transcribe_audio(self, local_path) -> str:
        """
        Return the stripped transcript of the audio file at `local_path`.

        Relies on the optional `speech_recognition` module (bound to `sr`), so
        it must only be called when IS_AUDIO_TRANSCRIPTION_CAPABLE is True.
        Exceptions raised by the recognizer propagate to the caller.
        """
        recognizer = sr.Recognizer()
        with sr.AudioFile(local_path) as source:
            audio = recognizer.record(source)
            return recognizer.recognize_google(audio).strip()
|