Fixed exif warning test.

This commit is contained in:
Adam Fourney 2025-03-05 10:39:29 -08:00
parent a9ceb13feb
commit 736e0ae332
8 changed files with 13 additions and 146 deletions

View file

@ -33,8 +33,7 @@ from .converters import (
XlsConverter,
PptxConverter,
ImageConverter,
WavConverter,
Mp3Converter,
AudioConverter,
OutlookMsgConverter,
ZipConverter,
DocumentIntelligenceConverter,
@ -140,8 +139,7 @@ class MarkItDown:
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
self.register_converter(WavConverter())
self.register_converter(Mp3Converter())
self.register_converter(AudioConverter())
self.register_converter(ImageConverter())
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())

View file

@ -14,8 +14,7 @@ from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
from ._wav_converter import WavConverter
from ._mp3_converter import Mp3Converter
from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
@ -34,8 +33,7 @@ __all__ = [
"XlsConverter",
"PptxConverter",
"ImageConverter",
"WavConverter",
"Mp3Converter",
"AudioConverter",
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",

View file

@ -21,9 +21,9 @@ ACCEPTED_FILE_EXTENSIONS = [
]
class WavConverter(DocumentConverter):
class AudioConverter(DocumentConverter):
"""
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
"""
def __init__(
@ -37,10 +37,6 @@ class WavConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
"""
Make sure we're dealing with HTML content *from* Wikipedia.
"""
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()

View file

@ -4,7 +4,7 @@ import locale
import sys
import shutil
import os
from warnings import warn
import warnings
from typing import BinaryIO, Literal, Optional
@ -15,7 +15,7 @@ def exiftool_metadata(
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
warnings.warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")

View file

@ -1,41 +0,0 @@
import subprocess
import shutil
import json
from warnings import warn
from .._base_converter import DocumentConverter
class MediaConverter(DocumentConverter):
    """
    Abstract class for multi-modal media (e.g., images and audio)
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)

    def _get_metadata(self, local_path, exiftool_path=None):
        """
        Read metadata for `local_path` by running `exiftool -json`.

        Args:
            local_path: Path of the file to inspect; passed verbatim to exiftool.
            exiftool_path: Explicit path of the exiftool executable. When it is
                falsy, exiftool is never run.

        Returns:
            The first element of the JSON array printed by exiftool, or None
            when no `exiftool_path` was supplied.

        Errors from launching exiftool or from parsing its output are not
        caught here and propagate to the caller.
        """
        if not exiftool_path:
            # Implicit discovery is disabled: if an exiftool binary is on PATH,
            # only tell the user how to opt in explicitly; never run it.
            which_exiftool = shutil.which("exiftool")
            if which_exiftool:
                warn(
                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
                    DeprecationWarning,
                )
            return None

        # Arguments are passed as a list (no shell), so local_path is never
        # interpreted by a shell.
        result = subprocess.run(
            [exiftool_path, "-json", local_path], capture_output=True, text=True
        ).stdout
        # exiftool -json prints a JSON array; the first element is returned.
        return json.loads(result)[0]

View file

@ -1,86 +0,0 @@
import tempfile
from typing import Union
from .._base_converter import DocumentConverter, DocumentConverterResult
from warnings import resetwarnings, catch_warnings
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
    # pydub emits a warning (instead of raising) when ffmpeg or avconv is
    # missing, so record warnings during the import and treat any recorded
    # warning as "dependency not available".
    #
    # catch_warnings() restores the warning filters when the block exits, so
    # no explicit resetwarnings() is needed afterwards. Calling it would wipe
    # every warning filter configured by the host application at import time.
    with catch_warnings(record=True) as w:
        import pydub

        if w:
            raise ModuleNotFoundError

    import speech_recognition as sr

    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
    # One of the optional packages is missing; transcription stays disabled.
    pass
class Mp3Converter(DocumentConverter):
    """
    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).

    NOTE(review): the implementation below is entirely commented out, so this
    class body currently defines no methods of its own. The behaviour described
    above is what the disabled code was written to do, not what the class does
    as shown here. Confirm whether this class is still meant to be registered.
    """

    # Disabled implementation, kept for reference. As written it would: bail
    # out unless the extension is ".mp3", list selected metadata fields,
    # export the audio to a temporary WAV file via pydub, and append a
    # transcript of that file.
    #
    # def __init__(
    # self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    # ):
    # super().__init__(priority=priority)
    #
    # def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
    # # Bail if not a MP3
    # extension = kwargs.get("file_extension", "")
    # if extension.lower() != ".mp3":
    # return None
    #
    # md_content = ""
    #
    # # Add metadata
    # metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
    # if metadata:
    # for f in [
    # "Title",
    # "Artist",
    # "Author",
    # "Band",
    # "Album",
    # "Genre",
    # "Track",
    # "DateTimeOriginal",
    # "CreateDate",
    # "Duration",
    # ]:
    # if f in metadata:
    # md_content += f"{f}: {metadata[f]}\n"
    #
    # # Transcribe
    # if IS_AUDIO_TRANSCRIPTION_CAPABLE:
    # handle, temp_path = tempfile.mkstemp(suffix=".wav")
    # os.close(handle)
    # try:
    # sound = pydub.AudioSegment.from_mp3(local_path)
    # sound.export(temp_path, format="wav")
    #
    # _args = dict()
    # _args.update(kwargs)
    # _args["file_extension"] = ".wav"
    #
    # try:
    # transcript = super()._transcribe_audio(temp_path).strip()
    # md_content += "\n\n### Audio Transcript:\n" + (
    # "[No speech detected]" if transcript == "" else transcript
    # )
    # except Exception:
    # md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
    #
    # finally:
    # os.unlink(temp_path)
    #
    # # Return the result
    # return DocumentConverterResult(markdown=md_content.strip())

View file

@ -1,4 +1,5 @@
import io
import sys
from typing import BinaryIO
from .._exceptions import MissingDependencyException

View file

@ -7,7 +7,7 @@ import openai
import pytest
import requests
from warnings import catch_warnings, resetwarnings
import warnings
from markitdown import (
MarkItDown,
@ -440,14 +440,15 @@ def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
warnings.simplefilter("default")
with warnings.catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
warnings.resetwarnings()
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")