Added support for various audio files.
This commit is contained in:
parent
c426cb81b3
commit
a9ceb13feb
11 changed files with 363 additions and 197 deletions
|
|
@ -4,9 +4,6 @@ from warnings import warn
|
||||||
from typing import Any, Union, BinaryIO, Optional, List
|
from typing import Any, Union, BinaryIO, Optional, List
|
||||||
from ._stream_info import StreamInfo
|
from ._stream_info import StreamInfo
|
||||||
|
|
||||||
# Avoid printing the same warning multiple times
|
|
||||||
_WARNED: List[str] = []
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverterResult:
|
class DocumentConverterResult:
|
||||||
"""The result of converting a document to Markdown."""
|
"""The result of converting a document to Markdown."""
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ from warnings import warn
|
||||||
import puremagic
|
import puremagic
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from ._stream_info import StreamInfo
|
from ._stream_info import StreamInfo, _guess_stream_info_from_stream
|
||||||
|
|
||||||
from .converters import (
|
from .converters import (
|
||||||
PlainTextConverter,
|
PlainTextConverter,
|
||||||
|
|
@ -254,7 +254,7 @@ class MarkItDown:
|
||||||
with open(path, "rb") as fh:
|
with open(path, "rb") as fh:
|
||||||
# Prepare a list of configurations to try, starting with the base_stream_info
|
# Prepare a list of configurations to try, starting with the base_stream_info
|
||||||
guesses: List[StreamInfo] = [base_stream_info]
|
guesses: List[StreamInfo] = [base_stream_info]
|
||||||
for guess in StreamInfo.guess_from_stream(
|
for guess in _guess_stream_info_from_stream(
|
||||||
file_stream=fh, filename_hint=path
|
file_stream=fh, filename_hint=path
|
||||||
):
|
):
|
||||||
guesses.append(base_stream_info.copy_and_update(guess))
|
guesses.append(base_stream_info.copy_and_update(guess))
|
||||||
|
|
@ -298,7 +298,7 @@ class MarkItDown:
|
||||||
placeholder_filename = "placeholder" + base_guess.extension
|
placeholder_filename = "placeholder" + base_guess.extension
|
||||||
|
|
||||||
# Add guesses based on stream content
|
# Add guesses based on stream content
|
||||||
for guess in StreamInfo.guess_from_stream(
|
for guess in _guess_stream_info_from_stream(
|
||||||
file_stream=stream, filename_hint=placeholder_filename
|
file_stream=stream, filename_hint=placeholder_filename
|
||||||
):
|
):
|
||||||
guesses.append(base_guess.copy_and_update(guess))
|
guesses.append(base_guess.copy_and_update(guess))
|
||||||
|
|
@ -393,7 +393,7 @@ class MarkItDown:
|
||||||
placeholder_filename = "placeholder" + base_guess.extension
|
placeholder_filename = "placeholder" + base_guess.extension
|
||||||
|
|
||||||
# Add guesses based on stream content
|
# Add guesses based on stream content
|
||||||
for guess in StreamInfo.guess_from_stream(
|
for guess in _guess_stream_info_from_stream(
|
||||||
file_stream=buffer, filename_hint=placeholder_filename
|
file_stream=buffer, filename_hint=placeholder_filename
|
||||||
):
|
):
|
||||||
guesses.append(base_guess.copy_and_update(guess))
|
guesses.append(base_guess.copy_and_update(guess))
|
||||||
|
|
|
||||||
|
|
@ -43,10 +43,14 @@ class StreamInfo:
|
||||||
|
|
||||||
return StreamInfo(**new_info)
|
return StreamInfo(**new_info)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def guess_from_stream(
|
# Behavior subject to change.
|
||||||
cls: Type[T], file_stream: BinaryIO, *, filename_hint: Optional[str] = None
|
# Do not rely on this outside of this module.
|
||||||
) -> List[T]:
|
def _guess_stream_info_from_stream(
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
*,
|
||||||
|
filename_hint: Optional[str] = None,
|
||||||
|
) -> List[StreamInfo]:
|
||||||
"""
|
"""
|
||||||
Guess StreamInfo properties (mostly mimetype and extension) from a stream.
|
Guess StreamInfo properties (mostly mimetype and extension) from a stream.
|
||||||
|
|
||||||
|
|
@ -67,7 +71,9 @@ class StreamInfo:
|
||||||
|
|
||||||
if mimetype:
|
if mimetype:
|
||||||
guesses.append(
|
guesses.append(
|
||||||
cls(mimetype=mimetype, extension=os.path.splitext(filename_hint)[1])
|
StreamInfo(
|
||||||
|
mimetype=mimetype, extension=os.path.splitext(filename_hint)[1]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def _puremagic(
|
def _puremagic(
|
||||||
|
|
@ -113,6 +119,6 @@ class StreamInfo:
|
||||||
if len(kwargs) > 0:
|
if len(kwargs) > 0:
|
||||||
# We don't add the filename_hint, because sometimes it's just a placeholder,
|
# We don't add the filename_hint, because sometimes it's just a placeholder,
|
||||||
# and, in any case, doesn't add new information.
|
# and, in any case, doesn't add new information.
|
||||||
guesses.append(cls(**kwargs))
|
guesses.append(StreamInfo(**kwargs))
|
||||||
|
|
||||||
return guesses
|
return guesses
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,12 @@
|
||||||
from typing import Any, Union
|
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
from typing import BinaryIO, Any, List
|
||||||
|
|
||||||
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException
|
from .._stream_info import StreamInfo
|
||||||
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
|
|
@ -26,6 +29,40 @@ except ImportError:
|
||||||
CONTENT_FORMAT = "markdown"
|
CONTENT_FORMAT = "markdown"
|
||||||
|
|
||||||
|
|
||||||
|
OFFICE_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml",
|
||||||
|
"application/xhtml",
|
||||||
|
"text/html",
|
||||||
|
]
|
||||||
|
|
||||||
|
OTHER_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/pdf",
|
||||||
|
"application/x-pdf",
|
||||||
|
"text/html",
|
||||||
|
"image/",
|
||||||
|
]
|
||||||
|
|
||||||
|
OFFICE_FILE_EXTENSIONS = [
|
||||||
|
".docx",
|
||||||
|
".xlsx",
|
||||||
|
".pptx",
|
||||||
|
".html",
|
||||||
|
".htm",
|
||||||
|
]
|
||||||
|
|
||||||
|
OTHER_FILE_EXTENSIONS = [
|
||||||
|
".pdf",
|
||||||
|
".jpeg",
|
||||||
|
".jpg",
|
||||||
|
".png",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".heif",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class DocumentIntelligenceConverter(DocumentConverter):
|
class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
|
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
|
||||||
|
|
||||||
|
|
@ -57,46 +94,57 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
self._priority = priority
|
self._priority = priority
|
||||||
|
|
||||||
def convert(
|
def accepts(
|
||||||
self, local_path: str, **kwargs: Any
|
self,
|
||||||
) -> Union[None, DocumentConverterResult]:
|
file_stream: BinaryIO,
|
||||||
# Bail if extension is not supported by Document Intelligence
|
stream_info: StreamInfo,
|
||||||
extension = kwargs.get("file_extension", "")
|
**kwargs: Any, # Options to pass to the converter
|
||||||
docintel_extensions = [
|
) -> bool:
|
||||||
".pdf",
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
".docx",
|
extension = (stream_info.extension or "").lower()
|
||||||
".xlsx",
|
|
||||||
".pptx",
|
|
||||||
".html",
|
|
||||||
".jpeg",
|
|
||||||
".jpg",
|
|
||||||
".png",
|
|
||||||
".bmp",
|
|
||||||
".tiff",
|
|
||||||
".heif",
|
|
||||||
]
|
|
||||||
if extension.lower() not in docintel_extensions:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Get the bytestring for the local path
|
if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
|
||||||
with open(local_path, "rb") as f:
|
return True
|
||||||
file_bytes = f.read()
|
|
||||||
|
|
||||||
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
|
for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
|
||||||
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
|
if mimetype.startswith(prefix):
|
||||||
analysis_features = []
|
return True
|
||||||
else:
|
|
||||||
analysis_features = [
|
return False
|
||||||
|
|
||||||
|
def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
|
||||||
|
"""
|
||||||
|
Helper needed to determine which analysis features to use.
|
||||||
|
Certain document analysis features are not available for
|
||||||
|
office filetypes (.xlsx, .pptx, .html, .docx)
|
||||||
|
"""
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if extension in OFFICE_FILE_EXTENSIONS:
|
||||||
|
return []
|
||||||
|
|
||||||
|
for prefix in OFFICE_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return []
|
||||||
|
|
||||||
|
return [
|
||||||
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
|
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
|
||||||
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
|
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
|
||||||
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
|
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Extract the text using Azure Document Intelligence
|
# Extract the text using Azure Document Intelligence
|
||||||
poller = self.doc_intel_client.begin_analyze_document(
|
poller = self.doc_intel_client.begin_analyze_document(
|
||||||
model_id="prebuilt-layout",
|
model_id="prebuilt-layout",
|
||||||
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
|
body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
|
||||||
features=analysis_features,
|
features=self._analysis_features(stream_info),
|
||||||
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
|
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
|
||||||
)
|
)
|
||||||
result: AnalyzeResult = poller.result()
|
result: AnalyzeResult = poller.result()
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from ._wav_converter import WavConverter
|
|
||||||
from warnings import resetwarnings, catch_warnings
|
from warnings import resetwarnings, catch_warnings
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
|
|
@ -23,64 +22,65 @@ finally:
|
||||||
resetwarnings()
|
resetwarnings()
|
||||||
|
|
||||||
|
|
||||||
class Mp3Converter(WavConverter):
|
class Mp3Converter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
# def __init__(
|
||||||
# Bail if not a MP3
|
# self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
extension = kwargs.get("file_extension", "")
|
# ):
|
||||||
if extension.lower() != ".mp3":
|
# super().__init__(priority=priority)
|
||||||
return None
|
#
|
||||||
|
# def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
md_content = ""
|
# # Bail if not a MP3
|
||||||
|
# extension = kwargs.get("file_extension", "")
|
||||||
# Add metadata
|
# if extension.lower() != ".mp3":
|
||||||
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
# return None
|
||||||
if metadata:
|
#
|
||||||
for f in [
|
# md_content = ""
|
||||||
"Title",
|
#
|
||||||
"Artist",
|
# # Add metadata
|
||||||
"Author",
|
# metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
||||||
"Band",
|
# if metadata:
|
||||||
"Album",
|
# for f in [
|
||||||
"Genre",
|
# "Title",
|
||||||
"Track",
|
# "Artist",
|
||||||
"DateTimeOriginal",
|
# "Author",
|
||||||
"CreateDate",
|
# "Band",
|
||||||
"Duration",
|
# "Album",
|
||||||
]:
|
# "Genre",
|
||||||
if f in metadata:
|
# "Track",
|
||||||
md_content += f"{f}: {metadata[f]}\n"
|
# "DateTimeOriginal",
|
||||||
|
# "CreateDate",
|
||||||
# Transcribe
|
# "Duration",
|
||||||
if IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
# ]:
|
||||||
handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
# if f in metadata:
|
||||||
os.close(handle)
|
# md_content += f"{f}: {metadata[f]}\n"
|
||||||
try:
|
#
|
||||||
sound = pydub.AudioSegment.from_mp3(local_path)
|
# # Transcribe
|
||||||
sound.export(temp_path, format="wav")
|
# if IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
||||||
|
# handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
||||||
_args = dict()
|
# os.close(handle)
|
||||||
_args.update(kwargs)
|
# try:
|
||||||
_args["file_extension"] = ".wav"
|
# sound = pydub.AudioSegment.from_mp3(local_path)
|
||||||
|
# sound.export(temp_path, format="wav")
|
||||||
try:
|
#
|
||||||
transcript = super()._transcribe_audio(temp_path).strip()
|
# _args = dict()
|
||||||
md_content += "\n\n### Audio Transcript:\n" + (
|
# _args.update(kwargs)
|
||||||
"[No speech detected]" if transcript == "" else transcript
|
# _args["file_extension"] = ".wav"
|
||||||
)
|
#
|
||||||
except Exception:
|
# try:
|
||||||
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
# transcript = super()._transcribe_audio(temp_path).strip()
|
||||||
|
# md_content += "\n\n### Audio Transcript:\n" + (
|
||||||
finally:
|
# "[No speech detected]" if transcript == "" else transcript
|
||||||
os.unlink(temp_path)
|
# )
|
||||||
|
# except Exception:
|
||||||
# Return the result
|
# md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
#
|
||||||
|
# finally:
|
||||||
|
# os.unlink(temp_path)
|
||||||
|
#
|
||||||
|
# # Return the result
|
||||||
|
# return DocumentConverterResult(markdown=md_content.strip())
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
import io
|
||||||
|
from typing import BinaryIO
|
||||||
|
from .._exceptions import MissingDependencyException
|
||||||
|
|
||||||
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
# Save reporting of any exceptions for later
|
||||||
|
_dependency_exc_info = None
|
||||||
|
try:
|
||||||
|
import speech_recognition as sr
|
||||||
|
import pydub
|
||||||
|
except ImportError:
|
||||||
|
# Preserve the error and stack trace for later
|
||||||
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str:
|
||||||
|
# Check for installed dependencies
|
||||||
|
if _dependency_exc_info is not None:
|
||||||
|
raise MissingDependencyException(
|
||||||
|
"Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
|
||||||
|
) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
|
||||||
|
|
||||||
|
if audio_format in ["wav", "aiff", "flac"]:
|
||||||
|
audio_source = file_stream
|
||||||
|
elif audio_format in ["mp3", "mp4"]:
|
||||||
|
audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)
|
||||||
|
|
||||||
|
audio_source = io.BytesIO()
|
||||||
|
audio_segment.export(audio_source, format="wav")
|
||||||
|
audio_source.seek(0)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported audio format: {audio_format}")
|
||||||
|
|
||||||
|
recognizer = sr.Recognizer()
|
||||||
|
with sr.AudioFile(audio_source) as source:
|
||||||
|
audio = recognizer.record(source)
|
||||||
|
transcript = recognizer.recognize_google(audio).strip()
|
||||||
|
return "[No speech detected]" if transcript == "" else transcript
|
||||||
|
|
@ -1,18 +1,27 @@
|
||||||
from typing import Union
|
import io
|
||||||
|
from typing import Any, BinaryIO, Optional
|
||||||
|
|
||||||
|
from ._exiftool import exiftool_metadata
|
||||||
|
from ._transcribe_audio import transcribe_audio
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from .._stream_info import StreamInfo
|
||||||
|
from .._exceptions import MissingDependencyException
|
||||||
|
|
||||||
# Optional Transcription support
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
"audio/x-wav",
|
||||||
try:
|
"audio/mpeg",
|
||||||
import speech_recognition as sr
|
"video/mp4",
|
||||||
|
]
|
||||||
|
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
ACCEPTED_FILE_EXTENSIONS = [
|
||||||
except ModuleNotFoundError:
|
".wav",
|
||||||
pass
|
".mp3",
|
||||||
|
".m4a",
|
||||||
|
".mp4",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class WavConverter(MediaConverter):
|
class WavConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
Converts audio files (WAV, MP3, M4A/MP4) to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` and `pydub` are installed).
|
||||||
"""
|
"""
|
||||||
|
|
@ -22,16 +31,40 @@ class WavConverter(MediaConverter):
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def accepts(
|
||||||
# Bail if not a WAV
|
self,
|
||||||
extension = kwargs.get("file_extension", "")
|
file_stream: BinaryIO,
|
||||||
if extension.lower() != ".wav":
|
stream_info: StreamInfo,
|
||||||
return None
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Accept the stream when its file extension or MIME type identifies a supported audio format.
|
||||||
|
"""
|
||||||
|
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
# Add metadata
|
||||||
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
metadata = exiftool_metadata(
|
||||||
|
file_stream, exiftool_path=kwargs.get("exiftool_path")
|
||||||
|
)
|
||||||
if metadata:
|
if metadata:
|
||||||
for f in [
|
for f in [
|
||||||
"Title",
|
"Title",
|
||||||
|
|
@ -43,27 +76,36 @@ class WavConverter(MediaConverter):
|
||||||
"Track",
|
"Track",
|
||||||
"DateTimeOriginal",
|
"DateTimeOriginal",
|
||||||
"CreateDate",
|
"CreateDate",
|
||||||
"Duration",
|
# "Duration", -- Wrong values when read from memory
|
||||||
|
"NumChannels",
|
||||||
|
"SampleRate",
|
||||||
|
"AvgBytesPerSec",
|
||||||
|
"BitsPerSample",
|
||||||
]:
|
]:
|
||||||
if f in metadata:
|
if f in metadata:
|
||||||
md_content += f"{f}: {metadata[f]}\n"
|
md_content += f"{f}: {metadata[f]}\n"
|
||||||
|
|
||||||
|
# Figure out the audio format for transcription
|
||||||
|
if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
|
||||||
|
audio_format = "wav"
|
||||||
|
elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
|
||||||
|
audio_format = "mp3"
|
||||||
|
elif (
|
||||||
|
stream_info.extension in [".mp4", ".m4a"]
|
||||||
|
or stream_info.mimetype == "video/mp4"
|
||||||
|
):
|
||||||
|
audio_format = "mp4"
|
||||||
|
else:
|
||||||
|
audio_format = None
|
||||||
|
|
||||||
# Transcribe
|
# Transcribe
|
||||||
if IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
if audio_format:
|
||||||
try:
|
try:
|
||||||
transcript = self._transcribe_audio(local_path)
|
transcript = transcribe_audio(file_stream, audio_format=audio_format)
|
||||||
md_content += "\n\n### Audio Transcript:\n" + (
|
if transcript:
|
||||||
"[No speech detected]" if transcript == "" else transcript
|
md_content += "\n\n### Audio Transcript:\n" + transcript
|
||||||
)
|
except MissingDependencyException:
|
||||||
except Exception:
|
pass
|
||||||
md_content += (
|
|
||||||
"\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
# Return the result
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
|
|
||||||
def _transcribe_audio(self, local_path) -> str:
|
|
||||||
recognizer = sr.Recognizer()
|
|
||||||
with sr.AudioFile(local_path) as source:
|
|
||||||
audio = recognizer.record(source)
|
|
||||||
return recognizer.recognize_google(audio).strip()
|
|
||||||
|
|
|
||||||
BIN
packages/markitdown/tests/test_files/test.m4a
Executable file
BIN
packages/markitdown/tests/test_files/test.m4a
Executable file
Binary file not shown.
BIN
packages/markitdown/tests/test_files/test.mp3
Normal file
BIN
packages/markitdown/tests/test_files/test.mp3
Normal file
Binary file not shown.
BIN
packages/markitdown/tests/test_files/test.wav
Normal file
BIN
packages/markitdown/tests/test_files/test.wav
Normal file
Binary file not shown.
|
|
@ -15,6 +15,7 @@ from markitdown import (
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
StreamInfo,
|
StreamInfo,
|
||||||
)
|
)
|
||||||
|
from markitdown._stream_info import _guess_stream_info_from_stream
|
||||||
|
|
||||||
skip_remote = (
|
skip_remote = (
|
||||||
True if os.environ.get("GITHUB_ACTIONS") else False
|
True if os.environ.get("GITHUB_ACTIONS") else False
|
||||||
|
|
@ -41,6 +42,13 @@ JPG_TEST_EXIFTOOL = {
|
||||||
"DateTimeOriginal": "2024:03:14 22:10:00",
|
"DateTimeOriginal": "2024:03:14 22:10:00",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MP3_TEST_EXIFTOOL = {
|
||||||
|
"Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
|
||||||
|
"Artist": "Artist Name Test String",
|
||||||
|
"Album": "Album Name Test String",
|
||||||
|
"SampleRate": "48000",
|
||||||
|
}
|
||||||
|
|
||||||
PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
|
PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
|
||||||
PDF_TEST_STRINGS = [
|
PDF_TEST_STRINGS = [
|
||||||
"While there is contemporaneous exploration of multi-agent approaches"
|
"While there is contemporaneous exploration of multi-agent approaches"
|
||||||
|
|
@ -261,7 +269,7 @@ def test_stream_info_guesses() -> None:
|
||||||
|
|
||||||
for file_path, expected_mimetype in test_tuples:
|
for file_path, expected_mimetype in test_tuples:
|
||||||
with open(file_path, "rb") as f:
|
with open(file_path, "rb") as f:
|
||||||
guesses = StreamInfo.guess_from_stream(
|
guesses = _guess_stream_info_from_stream(
|
||||||
f, filename_hint=os.path.basename(file_path)
|
f, filename_hint=os.path.basename(file_path)
|
||||||
)
|
)
|
||||||
assert len(guesses) > 0
|
assert len(guesses) > 0
|
||||||
|
|
@ -389,6 +397,26 @@ def test_markitdown_local() -> None:
|
||||||
assert "# Test" in result.text_content
|
assert "# Test" in result.text_content
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
skip_remote,
|
||||||
|
reason="do not run remotely run speech transcription tests",
|
||||||
|
)
|
||||||
|
def test_speech_transcription() -> None:
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
|
# Test WAV files, MP3 and M4A files
|
||||||
|
for file_name in ["test.wav", "test.mp3", "test.m4a"]:
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name))
|
||||||
|
result_lower = result.text_content.lower()
|
||||||
|
assert (
|
||||||
|
("1" in result_lower or "one" in result_lower)
|
||||||
|
and ("2" in result_lower or "two" in result_lower)
|
||||||
|
and ("3" in result_lower or "three" in result_lower)
|
||||||
|
and ("4" in result_lower or "four" in result_lower)
|
||||||
|
and ("5" in result_lower or "five" in result_lower)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_exceptions() -> None:
|
def test_exceptions() -> None:
|
||||||
# Check that an exception is raised when trying to convert an unsupported format
|
# Check that an exception is raised when trying to convert an unsupported format
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown()
|
||||||
|
|
@ -437,6 +465,12 @@ def test_markitdown_exiftool() -> None:
|
||||||
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
|
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
|
||||||
assert target in result.text_content
|
assert target in result.text_content
|
||||||
|
|
||||||
|
# Test some other media types
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3"))
|
||||||
|
for key in MP3_TEST_EXIFTOOL:
|
||||||
|
target = f"{key}: {MP3_TEST_EXIFTOOL[key]}"
|
||||||
|
assert target in result.text_content
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_llm,
|
skip_llm,
|
||||||
|
|
@ -470,6 +504,7 @@ if __name__ == "__main__":
|
||||||
test_stream_info_guesses()
|
test_stream_info_guesses()
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
test_markitdown_local()
|
test_markitdown_local()
|
||||||
|
test_speech_transcription()
|
||||||
test_exceptions()
|
test_exceptions()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
test_markitdown_llm()
|
test_markitdown_llm()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue