Fixed exif warning test.

2025-03-05 10:39:29 -08:00 · 2025-03-05 10:39:29 -08:00 · 736e0ae332
commit 736e0ae332
parent a9ceb13feb
8 changed files with 13 additions and 146 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -33,8 +33,7 @@ from .converters import (
    XlsConverter,
    PptxConverter,
    ImageConverter,
-    WavConverter,
-    Mp3Converter,
+    AudioConverter,
    OutlookMsgConverter,
    ZipConverter,
    DocumentIntelligenceConverter,
@@ -140,8 +139,7 @@ class MarkItDown:
            self.register_converter(XlsxConverter())
            self.register_converter(XlsConverter())
            self.register_converter(PptxConverter())
-            self.register_converter(WavConverter())
-            self.register_converter(Mp3Converter())
+            self.register_converter(AudioConverter())
            self.register_converter(ImageConverter())
            self.register_converter(IpynbConverter())
            self.register_converter(PdfConverter())
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -14,8 +14,7 @@ from ._docx_converter import DocxConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
-from ._wav_converter import WavConverter
-from ._mp3_converter import Mp3Converter
+from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
@@ -34,8 +33,7 @@ __all__ = [
    "XlsConverter",
    "PptxConverter",
    "ImageConverter",
-    "WavConverter",
-    "Mp3Converter",
+    "AudioConverter",
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
--- a/packages/markitdown/src/markitdown/converters/_audio_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_audio_converter.py
@@ -21,9 +21,9 @@ ACCEPTED_FILE_EXTENSIONS = [
 ]


-class WavConverter(DocumentConverter):
+class AudioConverter(DocumentConverter):
    """
-    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def __init__(
@@ -37,10 +37,6 @@ class WavConverter(DocumentConverter):
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
-        """
-        Make sure we're dealing with HTML content *from* Wikipedia.
-        """
-
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

--- a/packages/markitdown/src/markitdown/converters/_exiftool.py
+++ b/packages/markitdown/src/markitdown/converters/_exiftool.py
@@ -4,7 +4,7 @@ import locale
 import sys
 import shutil
 import os
-from warnings import warn
+import warnings
 from typing import BinaryIO, Literal, Optional


@@ -15,7 +15,7 @@ def exiftool_metadata(
    if not exiftool_path:
        which_exiftool = shutil.which("exiftool")
        if which_exiftool:
-            warn(
+            warnings.warn(
                f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., 

    md = MarkItDown(exiftool_path="{which_exiftool}")
--- a/packages/markitdown/src/markitdown/converters/_media_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_media_converter.py
@@ -1,41 +0,0 @@
-import subprocess
-import shutil
-import json
-from warnings import warn
-
-from .._base_converter import DocumentConverter
-
-
-class MediaConverter(DocumentConverter):
-    """
-    Abstract class for multi-modal media (e.g., images and audio)
-    """
-
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
-    def _get_metadata(self, local_path, exiftool_path=None):
-        if not exiftool_path:
-            which_exiftool = shutil.which("exiftool")
-            if which_exiftool:
-                warn(
-                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., 
-
-    md = MarkItDown(exiftool_path="{which_exiftool}")
-
-This warning will be removed in future releases.
-""",
-                    DeprecationWarning,
-                )
-
-            return None
-        else:
-            if True:
-                result = subprocess.run(
-                    [exiftool_path, "-json", local_path], capture_output=True, text=True
-                ).stdout
-                return json.loads(result)[0]
-            # except Exception:
-            #    return None
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@@ -1,86 +0,0 @@
-import tempfile
-from typing import Union
-from .._base_converter import DocumentConverter, DocumentConverterResult
-from warnings import resetwarnings, catch_warnings
-
-# Optional Transcription support
-IS_AUDIO_TRANSCRIPTION_CAPABLE = False
-try:
-    # Using warnings' catch_warnings to catch
-    # pydub's warning of ffmpeg or avconv missing
-    with catch_warnings(record=True) as w:
-        import pydub
-
-        if w:
-            raise ModuleNotFoundError
-    import speech_recognition as sr
-
-    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
-except ModuleNotFoundError:
-    pass
-finally:
-    resetwarnings()
-
-
-class Mp3Converter(DocumentConverter):
-    """
-    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
-    """
-
-
-#    def __init__(
-#        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-#    ):
-#        super().__init__(priority=priority)
-#
-#    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-#        # Bail if not a MP3
-#        extension = kwargs.get("file_extension", "")
-#        if extension.lower() != ".mp3":
-#            return None
-#
-#        md_content = ""
-#
-#        # Add metadata
-#        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
-#        if metadata:
-#            for f in [
-#                "Title",
-#                "Artist",
-#                "Author",
-#                "Band",
-#                "Album",
-#                "Genre",
-#                "Track",
-#                "DateTimeOriginal",
-#                "CreateDate",
-#                "Duration",
-#            ]:
-#                if f in metadata:
-#                    md_content += f"{f}: {metadata[f]}\n"
-#
-#        # Transcribe
-#        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
-#            handle, temp_path = tempfile.mkstemp(suffix=".wav")
-#            os.close(handle)
-#            try:
-#                sound = pydub.AudioSegment.from_mp3(local_path)
-#                sound.export(temp_path, format="wav")
-#
-#                _args = dict()
-#                _args.update(kwargs)
-#                _args["file_extension"] = ".wav"
-#
-#                try:
-#                    transcript = super()._transcribe_audio(temp_path).strip()
-#                    md_content += "\n\n### Audio Transcript:\n" + (
-#                        "[No speech detected]" if transcript == "" else transcript
-#                    )
-#                except Exception:
-#                    md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
-#
-#            finally:
-#                os.unlink(temp_path)
-#
-#        # Return the result
-#        return DocumentConverterResult(markdown=md_content.strip())
--- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
+++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
@@ -1,4 +1,5 @@
 import io
+import sys
 from typing import BinaryIO
 from .._exceptions import MissingDependencyException

--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@@ -7,7 +7,7 @@ import openai
 import pytest
 import requests

-from warnings import catch_warnings, resetwarnings
+import warnings

 from markitdown import (
    MarkItDown,
@@ -440,14 +440,15 @@ def test_markitdown_exiftool() -> None:
    # Test the automatic discovery of exiftool throws a warning
    # and is disabled
    try:
-        with catch_warnings(record=True) as w:
+        warnings.simplefilter("default")
+        with warnings.catch_warnings(record=True) as w:
            markitdown = MarkItDown()
            result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert result.text_content.strip() == ""
    finally:
-        resetwarnings()
+        warnings.resetwarnings()

    # Test explicitly setting the location of exiftool
    which_exiftool = shutil.which("exiftool")