Added support for various audio files.

This commit is contained in:
Adam Fourney 2025-03-05 10:15:42 -08:00
parent c426cb81b3
commit a9ceb13feb
11 changed files with 363 additions and 197 deletions

View file

@ -4,9 +4,6 @@ from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List from typing import Any, Union, BinaryIO, Optional, List
from ._stream_info import StreamInfo from ._stream_info import StreamInfo
# Avoid printing the same warning multiple times
_WARNED: List[str] = []
class DocumentConverterResult: class DocumentConverterResult:
"""The result of converting a document to Markdown.""" """The result of converting a document to Markdown."""

View file

@ -17,7 +17,7 @@ from warnings import warn
import puremagic import puremagic
import requests import requests
from ._stream_info import StreamInfo from ._stream_info import StreamInfo, _guess_stream_info_from_stream
from .converters import ( from .converters import (
PlainTextConverter, PlainTextConverter,
@ -254,7 +254,7 @@ class MarkItDown:
with open(path, "rb") as fh: with open(path, "rb") as fh:
# Prepare a list of configurations to try, starting with the base_stream_info # Prepare a list of configurations to try, starting with the base_stream_info
guesses: List[StreamInfo] = [base_stream_info] guesses: List[StreamInfo] = [base_stream_info]
for guess in StreamInfo.guess_from_stream( for guess in _guess_stream_info_from_stream(
file_stream=fh, filename_hint=path file_stream=fh, filename_hint=path
): ):
guesses.append(base_stream_info.copy_and_update(guess)) guesses.append(base_stream_info.copy_and_update(guess))
@ -298,7 +298,7 @@ class MarkItDown:
placeholder_filename = "placeholder" + base_guess.extension placeholder_filename = "placeholder" + base_guess.extension
# Add guesses based on stream content # Add guesses based on stream content
for guess in StreamInfo.guess_from_stream( for guess in _guess_stream_info_from_stream(
file_stream=stream, filename_hint=placeholder_filename file_stream=stream, filename_hint=placeholder_filename
): ):
guesses.append(base_guess.copy_and_update(guess)) guesses.append(base_guess.copy_and_update(guess))
@ -393,7 +393,7 @@ class MarkItDown:
placeholder_filename = "placeholder" + base_guess.extension placeholder_filename = "placeholder" + base_guess.extension
# Add guesses based on stream content # Add guesses based on stream content
for guess in StreamInfo.guess_from_stream( for guess in _guess_stream_info_from_stream(
file_stream=buffer, filename_hint=placeholder_filename file_stream=buffer, filename_hint=placeholder_filename
): ):
guesses.append(base_guess.copy_and_update(guess)) guesses.append(base_guess.copy_and_update(guess))

View file

@ -43,10 +43,14 @@ class StreamInfo:
return StreamInfo(**new_info) return StreamInfo(**new_info)
@classmethod
def guess_from_stream( # Behavior subject to change.
cls: Type[T], file_stream: BinaryIO, *, filename_hint: Optional[str] = None # Do not rely on this outside of this module.
) -> List[T]: def _guess_stream_info_from_stream(
file_stream: BinaryIO,
*,
filename_hint: Optional[str] = None,
) -> List[StreamInfo]:
""" """
Guess StreamInfo properties (mostly mimetype and extension) from a stream. Guess StreamInfo properties (mostly mimetype and extension) from a stream.
@ -67,7 +71,9 @@ class StreamInfo:
if mimetype: if mimetype:
guesses.append( guesses.append(
cls(mimetype=mimetype, extension=os.path.splitext(filename_hint)[1]) StreamInfo(
mimetype=mimetype, extension=os.path.splitext(filename_hint)[1]
)
) )
def _puremagic( def _puremagic(
@ -113,6 +119,6 @@ class StreamInfo:
if len(kwargs) > 0: if len(kwargs) > 0:
# We don't add the filename_hint, because sometimes it's just a placeholder, # We don't add the filename_hint, because sometimes it's just a placeholder,
# and, in any case, doesn't add new information. # and, in any case, doesn't add new information.
guesses.append(cls(**kwargs)) guesses.append(StreamInfo(**kwargs))
return guesses return guesses

View file

@ -1,9 +1,12 @@
from typing import Any, Union
import re
import sys import sys
import re
from typing import BinaryIO, Any, List
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later # Save reporting of any exceptions for later
@ -26,6 +29,40 @@ except ImportError:
CONTENT_FORMAT = "markdown" CONTENT_FORMAT = "markdown"
# Types accepted by the Document Intelligence converter are split in two
# groups. The "office" group is the one for which `_analysis_features`
# returns no add-on analysis features; every other accepted type gets the
# full feature list. `accepts` matches against the union of both groups.
#
# Each entry must live in exactly one group so the classification is
# unambiguous ("text/html" is an office-group type, like ".html"/".htm").

# MIME type prefixes (matched with str.startswith) of the office group.
OFFICE_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.presentationml",
    "application/xhtml",
    "text/html",
]

# MIME type prefixes (matched with str.startswith) of all other accepted types.
OTHER_MIME_TYPE_PREFIXES = [
    "application/pdf",
    "application/x-pdf",
    "image/",
]

# File extensions (lower-case, with leading dot) of the office group.
OFFICE_FILE_EXTENSIONS = [
    ".docx",
    ".xlsx",
    ".pptx",
    ".html",
    ".htm",
]

# File extensions (lower-case, with leading dot) of all other accepted types.
OTHER_FILE_EXTENSIONS = [
    ".pdf",
    ".jpeg",
    ".jpg",
    ".png",
    ".bmp",
    ".tiff",
    ".heif",
]
class DocumentIntelligenceConverter(DocumentConverter): class DocumentIntelligenceConverter(DocumentConverter):
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
@ -57,46 +94,57 @@ class DocumentIntelligenceConverter(DocumentConverter):
) )
self._priority = priority self._priority = priority
def convert( def accepts(
self, local_path: str, **kwargs: Any self,
) -> Union[None, DocumentConverterResult]: file_stream: BinaryIO,
# Bail if extension is not supported by Document Intelligence stream_info: StreamInfo,
extension = kwargs.get("file_extension", "") **kwargs: Any, # Options to pass to the converter
docintel_extensions = [ ) -> bool:
".pdf", mimetype = (stream_info.mimetype or "").lower()
".docx", extension = (stream_info.extension or "").lower()
".xlsx",
".pptx",
".html",
".jpeg",
".jpg",
".png",
".bmp",
".tiff",
".heif",
]
if extension.lower() not in docintel_extensions:
return None
# Get the bytestring for the local path if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
with open(local_path, "rb") as f: return True
file_bytes = f.read()
# Certain document analysis features are not availiable for office filetypes (.xlsx, .pptx, .html, .docx) for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]: if mimetype.startswith(prefix):
analysis_features = [] return True
else:
analysis_features = [ return False
def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
"""
Helper needed to determine which analysis features to use.
Certain document analysis features are not available for
office filetypes (.xlsx, .pptx, .html, .docx)
"""
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in OFFICE_FILE_EXTENSIONS:
return []
for prefix in OFFICE_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return []
return [
DocumentAnalysisFeature.FORMULAS, # enable formula extraction DocumentAnalysisFeature.FORMULAS, # enable formula extraction
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
] ]
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Extract the text using Azure Document Intelligence # Extract the text using Azure Document Intelligence
poller = self.doc_intel_client.begin_analyze_document( poller = self.doc_intel_client.begin_analyze_document(
model_id="prebuilt-layout", model_id="prebuilt-layout",
body=AnalyzeDocumentRequest(bytes_source=file_bytes), body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
features=analysis_features, features=self._analysis_features(stream_info),
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
) )
result: AnalyzeResult = poller.result() result: AnalyzeResult = poller.result()

View file

@ -1,7 +1,6 @@
import tempfile import tempfile
from typing import Union from typing import Union
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings from warnings import resetwarnings, catch_warnings
# Optional Transcription support # Optional Transcription support
@ -23,64 +22,65 @@ finally:
resetwarnings() resetwarnings()
class Mp3Converter(WavConverter): class Mp3Converter(DocumentConverter):
""" """
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # def __init__(
# Bail if not a MP3 # self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
extension = kwargs.get("file_extension", "") # ):
if extension.lower() != ".mp3": # super().__init__(priority=priority)
return None #
# def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
md_content = "" # # Bail if not a MP3
# extension = kwargs.get("file_extension", "")
# Add metadata # if extension.lower() != ".mp3":
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) # return None
if metadata: #
for f in [ # md_content = ""
"Title", #
"Artist", # # Add metadata
"Author", # metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
"Band", # if metadata:
"Album", # for f in [
"Genre", # "Title",
"Track", # "Artist",
"DateTimeOriginal", # "Author",
"CreateDate", # "Band",
"Duration", # "Album",
]: # "Genre",
if f in metadata: # "Track",
md_content += f"{f}: {metadata[f]}\n" # "DateTimeOriginal",
# "CreateDate",
# Transcribe # "Duration",
if IS_AUDIO_TRANSCRIPTION_CAPABLE: # ]:
handle, temp_path = tempfile.mkstemp(suffix=".wav") # if f in metadata:
os.close(handle) # md_content += f"{f}: {metadata[f]}\n"
try: #
sound = pydub.AudioSegment.from_mp3(local_path) # # Transcribe
sound.export(temp_path, format="wav") # if IS_AUDIO_TRANSCRIPTION_CAPABLE:
# handle, temp_path = tempfile.mkstemp(suffix=".wav")
_args = dict() # os.close(handle)
_args.update(kwargs) # try:
_args["file_extension"] = ".wav" # sound = pydub.AudioSegment.from_mp3(local_path)
# sound.export(temp_path, format="wav")
try: #
transcript = super()._transcribe_audio(temp_path).strip() # _args = dict()
md_content += "\n\n### Audio Transcript:\n" + ( # _args.update(kwargs)
"[No speech detected]" if transcript == "" else transcript # _args["file_extension"] = ".wav"
) #
except Exception: # try:
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." # transcript = super()._transcribe_audio(temp_path).strip()
# md_content += "\n\n### Audio Transcript:\n" + (
finally: # "[No speech detected]" if transcript == "" else transcript
os.unlink(temp_path) # )
# except Exception:
# Return the result # md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
return DocumentConverterResult(markdown=md_content.strip()) #
# finally:
# os.unlink(temp_path)
#
# # Return the result
# return DocumentConverterResult(markdown=md_content.strip())

View file

@ -0,0 +1,38 @@
import io
from typing import BinaryIO
from .._exceptions import MissingDependencyException
# Try loading optional (but in this case, required) dependencies.
# An import failure is not raised here; it is recorded and reported later,
# when transcribe_audio() is actually called.
_dependency_exc_info = None
try:
    import speech_recognition as sr
    import pydub
except ImportError as _import_error:
    # Preserve the error and stack trace for later. The tuple is built from
    # the caught exception itself and has the same (type, value, traceback)
    # shape as sys.exc_info(); this module does not import `sys`, so calling
    # sys.exc_info() here would raise NameError instead of recording the
    # missing dependency.
    _dependency_exc_info = (
        type(_import_error),
        _import_error,
        _import_error.__traceback__,
    )
def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str:
    """Transcribe the speech contained in an audio stream.

    Args:
        file_stream: Binary stream holding the audio data.
        audio_format: Format of the data in ``file_stream``. "wav", "aiff"
            and "flac" streams are handed to ``speech_recognition`` as-is;
            "mp3" and "mp4" streams are first decoded with ``pydub`` and
            re-encoded as WAV in memory.

    Returns:
        The stripped transcript returned by ``Recognizer.recognize_google``,
        or "[No speech detected]" when nothing could be recognized.

    Raises:
        MissingDependencyException: If the optional transcription
            dependencies could not be imported.
        ValueError: If ``audio_format`` is not one of the supported formats.
    """
    # Check for installed dependencies
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
        ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])

    if audio_format in ["wav", "aiff", "flac"]:
        audio_source = file_stream
    elif audio_format in ["mp3", "mp4"]:
        # Convert to an in-memory WAV, then rewind so it is read from the start.
        audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)

        audio_source = io.BytesIO()
        audio_segment.export(audio_source, format="wav")
        audio_source.seek(0)
    else:
        raise ValueError(f"Unsupported audio format: {audio_format}")

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_source) as source:
        audio = recognizer.record(source)

    try:
        transcript = recognizer.recognize_google(audio).strip()
    except sr.UnknownValueError:
        # recognize_google reports "nothing intelligible" by raising rather
        # than by returning an empty string; treat it as an empty transcript
        # so the placeholder below is actually reachable.
        transcript = ""

    return "[No speech detected]" if transcript == "" else transcript

View file

@ -1,18 +1,27 @@
from typing import Union import io
from typing import Any, BinaryIO, Optional
from ._exiftool import exiftool_metadata
from ._transcribe_audio import transcribe_audio
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException
# Optional Transcription support ACCEPTED_MIME_TYPE_PREFIXES = [
IS_AUDIO_TRANSCRIPTION_CAPABLE = False "audio/x-wav",
try: "audio/mpeg",
import speech_recognition as sr "video/mp4",
]
IS_AUDIO_TRANSCRIPTION_CAPABLE = True ACCEPTED_FILE_EXTENSIONS = [
except ModuleNotFoundError: ".wav",
pass ".mp3",
".m4a",
".mp4",
]
class WavConverter(MediaConverter): class WavConverter(DocumentConverter):
""" """
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
""" """
@ -22,16 +31,40 @@ class WavConverter(MediaConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def accepts(
# Bail if not a WAV self,
extension = kwargs.get("file_extension", "") file_stream: BinaryIO,
if extension.lower() != ".wav": stream_info: StreamInfo,
return None **kwargs: Any, # Options to pass to the converter
) -> bool:
"""
Accept audio/video streams that have a supported file extension or MIME type prefix.
"""
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
md_content = "" md_content = ""
# Add metadata # Add metadata
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) metadata = exiftool_metadata(
file_stream, exiftool_path=kwargs.get("exiftool_path")
)
if metadata: if metadata:
for f in [ for f in [
"Title", "Title",
@ -43,27 +76,36 @@ class WavConverter(MediaConverter):
"Track", "Track",
"DateTimeOriginal", "DateTimeOriginal",
"CreateDate", "CreateDate",
"Duration", # "Duration", -- Wrong values when read from memory
"NumChannels",
"SampleRate",
"AvgBytesPerSec",
"BitsPerSample",
]: ]:
if f in metadata: if f in metadata:
md_content += f"{f}: {metadata[f]}\n" md_content += f"{f}: {metadata[f]}\n"
# Figure out the audio format for transcription
if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
audio_format = "wav"
elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
audio_format = "mp3"
elif (
stream_info.extension in [".mp4", ".m4a"]
or stream_info.mimetype == "video/mp4"
):
audio_format = "mp4"
else:
audio_format = None
# Transcribe # Transcribe
if IS_AUDIO_TRANSCRIPTION_CAPABLE: if audio_format:
try: try:
transcript = self._transcribe_audio(local_path) transcript = transcribe_audio(file_stream, audio_format=audio_format)
md_content += "\n\n### Audio Transcript:\n" + ( if transcript:
"[No speech detected]" if transcript == "" else transcript md_content += "\n\n### Audio Transcript:\n" + transcript
) except MissingDependencyException:
except Exception: pass
md_content += (
"\n\n### Audio Transcript:\nError. Could not transcribe this audio."
)
# Return the result
return DocumentConverterResult(markdown=md_content.strip()) return DocumentConverterResult(markdown=md_content.strip())
def _transcribe_audio(self, local_path) -> str:
recognizer = sr.Recognizer()
with sr.AudioFile(local_path) as source:
audio = recognizer.record(source)
return recognizer.recognize_google(audio).strip()

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -15,6 +15,7 @@ from markitdown import (
FileConversionException, FileConversionException,
StreamInfo, StreamInfo,
) )
from markitdown._stream_info import _guess_stream_info_from_stream
skip_remote = ( skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False True if os.environ.get("GITHUB_ACTIONS") else False
@ -41,6 +42,13 @@ JPG_TEST_EXIFTOOL = {
"DateTimeOriginal": "2024:03:14 22:10:00", "DateTimeOriginal": "2024:03:14 22:10:00",
} }
# Metadata fields expected in the converted output of test.mp3; each pair is
# checked as a "<key>: <value>" line by test_markitdown_exiftool.
MP3_TEST_EXIFTOOL = {
    "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
    "Artist": "Artist Name Test String",
    "Album": "Album Name Test String",
    "SampleRate": "48000",
}
PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf" PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
PDF_TEST_STRINGS = [ PDF_TEST_STRINGS = [
"While there is contemporaneous exploration of multi-agent approaches" "While there is contemporaneous exploration of multi-agent approaches"
@ -261,7 +269,7 @@ def test_stream_info_guesses() -> None:
for file_path, expected_mimetype in test_tuples: for file_path, expected_mimetype in test_tuples:
with open(file_path, "rb") as f: with open(file_path, "rb") as f:
guesses = StreamInfo.guess_from_stream( guesses = _guess_stream_info_from_stream(
f, filename_hint=os.path.basename(file_path) f, filename_hint=os.path.basename(file_path)
) )
assert len(guesses) > 0 assert len(guesses) > 0
@ -389,6 +397,26 @@ def test_markitdown_local() -> None:
assert "# Test" in result.text_content assert "# Test" in result.text_content
@pytest.mark.skipif(
    skip_remote,
    reason="do not run remotely run speech transcription tests",
)
def test_speech_transcription() -> None:
    """Check that WAV, MP3 and M4A test files are transcribed.

    Each test file is expected to yield a transcript mentioning the numbers
    one to five, as digits or as words. Only the text after the
    "### Audio Transcript:" heading is inspected, so digits that appear in
    metadata lines (e.g. "SampleRate: 48000") cannot satisfy the check.
    """
    transcript_heading = "### audio transcript:"
    expected_numbers = [
        ("1", "one"),
        ("2", "two"),
        ("3", "three"),
        ("4", "four"),
        ("5", "five"),
    ]

    markitdown = MarkItDown()

    # Test WAV files, MP3 and M4A files
    for file_name in ["test.wav", "test.mp3", "test.m4a"]:
        result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name))
        result_lower = result.text_content.lower()

        # Isolate the transcript from any metadata that precedes it.
        _, heading, transcript = result_lower.partition(transcript_heading)
        assert heading, f"{file_name}: no audio transcript section in output"

        for digit, word in expected_numbers:
            assert (
                digit in transcript or word in transcript
            ), f"{file_name}: neither '{digit}' nor '{word}' found in transcript"
def test_exceptions() -> None: def test_exceptions() -> None:
# Check that an exception is raised when trying to convert an unsupported format # Check that an exception is raised when trying to convert an unsupported format
markitdown = MarkItDown() markitdown = MarkItDown()
@ -437,6 +465,12 @@ def test_markitdown_exiftool() -> None:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content assert target in result.text_content
# Test some other media types
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3"))
for key in MP3_TEST_EXIFTOOL:
target = f"{key}: {MP3_TEST_EXIFTOOL[key]}"
assert target in result.text_content
@pytest.mark.skipif( @pytest.mark.skipif(
skip_llm, skip_llm,
@ -470,6 +504,7 @@ if __name__ == "__main__":
test_stream_info_guesses() test_stream_info_guesses()
test_markitdown_remote() test_markitdown_remote()
test_markitdown_local() test_markitdown_local()
test_speech_transcription()
test_exceptions() test_exceptions()
test_markitdown_exiftool() test_markitdown_exiftool()
test_markitdown_llm() test_markitdown_llm()