diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index c4166b0..104db11 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -33,8 +33,7 @@ from .converters import ( XlsConverter, PptxConverter, ImageConverter, - WavConverter, - Mp3Converter, + AudioConverter, OutlookMsgConverter, ZipConverter, DocumentIntelligenceConverter, @@ -140,8 +139,7 @@ class MarkItDown: self.register_converter(XlsxConverter()) self.register_converter(XlsConverter()) self.register_converter(PptxConverter()) - self.register_converter(WavConverter()) - self.register_converter(Mp3Converter()) + self.register_converter(AudioConverter()) self.register_converter(ImageConverter()) self.register_converter(IpynbConverter()) self.register_converter(PdfConverter()) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index 038038d..f43efe3 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -14,8 +14,7 @@ from ._docx_converter import DocxConverter from ._xlsx_converter import XlsxConverter, XlsConverter from ._pptx_converter import PptxConverter from ._image_converter import ImageConverter -from ._wav_converter import WavConverter -from ._mp3_converter import Mp3Converter +from ._audio_converter import AudioConverter from ._outlook_msg_converter import OutlookMsgConverter from ._zip_converter import ZipConverter from ._doc_intel_converter import DocumentIntelligenceConverter @@ -34,8 +33,7 @@ __all__ = [ "XlsConverter", "PptxConverter", "ImageConverter", - "WavConverter", - "Mp3Converter", + "AudioConverter", "OutlookMsgConverter", "ZipConverter", "DocumentIntelligenceConverter", diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_audio_converter.py similarity index 91% rename from packages/markitdown/src/markitdown/converters/_wav_converter.py rename to packages/markitdown/src/markitdown/converters/_audio_converter.py index 31eeed1..d502deb 100644 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ b/packages/markitdown/src/markitdown/converters/_audio_converter.py @@ -21,9 +21,9 @@ ACCEPTED_FILE_EXTENSIONS = [ ] -class WavConverter(DocumentConverter): +class AudioConverter(DocumentConverter): """ - Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). + Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). """ def __init__( @@ -37,10 +37,6 @@ class WavConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: - """ - Make sure we're dealing with HTML content *from* Wikipedia. - """ - mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() diff --git a/packages/markitdown/src/markitdown/converters/_exiftool.py b/packages/markitdown/src/markitdown/converters/_exiftool.py index 779301b..b492801 100644 --- a/packages/markitdown/src/markitdown/converters/_exiftool.py +++ b/packages/markitdown/src/markitdown/converters/_exiftool.py @@ -4,7 +4,7 @@ import locale import sys import shutil import os -from warnings import warn +import warnings from typing import BinaryIO, Literal, Optional @@ -15,7 +15,7 @@ def exiftool_metadata( if not exiftool_path: which_exiftool = shutil.which("exiftool") if which_exiftool: - warn( + warnings.warn( f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., md = MarkItDown(exiftool_path="{which_exiftool}") diff --git a/packages/markitdown/src/markitdown/converters/_media_converter.py b/packages/markitdown/src/markitdown/converters/_media_converter.py deleted file mode 100644 index 0a5cebf..0000000 --- a/packages/markitdown/src/markitdown/converters/_media_converter.py +++ /dev/null @@ -1,41 +0,0 @@ -import subprocess -import shutil -import json -from warnings import warn - -from .._base_converter import DocumentConverter - - -class MediaConverter(DocumentConverter): - """ - Abstract class for multi-modal media (e.g., images and audio) - """ - - def __init__( - self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT - ): - super().__init__(priority=priority) - - def _get_metadata(self, local_path, exiftool_path=None): - if not exiftool_path: - which_exiftool = shutil.which("exiftool") - if which_exiftool: - warn( - f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., - - md = MarkItDown(exiftool_path="{which_exiftool}") - -This warning will be removed in future releases. -""", - DeprecationWarning, - ) - - return None - else: - if True: - result = subprocess.run( - [exiftool_path, "-json", local_path], capture_output=True, text=True - ).stdout - return json.loads(result)[0] - # except Exception: - # return None diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py deleted file mode 100644 index 5ff5e88..0000000 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ /dev/null @@ -1,86 +0,0 @@ -import tempfile -from typing import Union -from .._base_converter import DocumentConverter, DocumentConverterResult -from warnings import resetwarnings, catch_warnings - -# Optional Transcription support -IS_AUDIO_TRANSCRIPTION_CAPABLE = False -try: - # Using warnings' catch_warnings to catch - # pydub's warning of ffmpeg or avconv missing - with catch_warnings(record=True) as w: - import pydub - - if w: - raise ModuleNotFoundError - import speech_recognition as sr - - IS_AUDIO_TRANSCRIPTION_CAPABLE = True -except ModuleNotFoundError: - pass -finally: - resetwarnings() - - -class Mp3Converter(DocumentConverter): - """ - Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). - """ - - -# def __init__( -# self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT -# ): -# super().__init__(priority=priority) -# -# def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: -# # Bail if not a MP3 -# extension = kwargs.get("file_extension", "") -# if extension.lower() != ".mp3": -# return None -# -# md_content = "" -# -# # Add metadata -# metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) -# if metadata: -# for f in [ -# "Title", -# "Artist", -# "Author", -# "Band", -# "Album", -# "Genre", -# "Track", -# "DateTimeOriginal", -# "CreateDate", -# "Duration", -# ]: -# if f in metadata: -# md_content += f"{f}: {metadata[f]}\n" -# -# # Transcribe -# if IS_AUDIO_TRANSCRIPTION_CAPABLE: -# handle, temp_path = tempfile.mkstemp(suffix=".wav") -# os.close(handle) -# try: -# sound = pydub.AudioSegment.from_mp3(local_path) -# sound.export(temp_path, format="wav") -# -# _args = dict() -# _args.update(kwargs) -# _args["file_extension"] = ".wav" -# -# try: -# transcript = super()._transcribe_audio(temp_path).strip() -# md_content += "\n\n### Audio Transcript:\n" + ( -# "[No speech detected]" if transcript == "" else transcript -# ) -# except Exception: -# md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." -# -# finally: -# os.unlink(temp_path) -# -# # Return the result -# return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py index da63336..cd212ba 100644 --- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py +++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py @@ -1,4 +1,5 @@ import io +import sys from typing import BinaryIO from .._exceptions import MissingDependencyException diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py index 88b0bd6..6a11824 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -7,7 +7,7 @@ import openai import pytest import requests -from warnings import catch_warnings, resetwarnings +import warnings from markitdown import ( MarkItDown, @@ -440,14 +440,15 @@ def test_markitdown_exiftool() -> None: # Test the automatic discovery of exiftool throws a warning # and is disabled try: - with catch_warnings(record=True) as w: + warnings.simplefilter("default") + with warnings.catch_warnings(record=True) as w: markitdown = MarkItDown() result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) assert len(w) == 1 assert w[0].category is DeprecationWarning assert result.text_content.strip() == "" finally: - resetwarnings() + warnings.resetwarnings() # Test explicitly setting the location of exiftool which_exiftool = shutil.which("exiftool")