Fixed exif warning test.

This commit is contained in:
Adam Fourney 2025-03-05 10:39:29 -08:00
parent a9ceb13feb
commit 736e0ae332
8 changed files with 13 additions and 146 deletions

View file

@ -33,8 +33,7 @@ from .converters import (
XlsConverter, XlsConverter,
PptxConverter, PptxConverter,
ImageConverter, ImageConverter,
WavConverter, AudioConverter,
Mp3Converter,
OutlookMsgConverter, OutlookMsgConverter,
ZipConverter, ZipConverter,
DocumentIntelligenceConverter, DocumentIntelligenceConverter,
@ -140,8 +139,7 @@ class MarkItDown:
self.register_converter(XlsxConverter()) self.register_converter(XlsxConverter())
self.register_converter(XlsConverter()) self.register_converter(XlsConverter())
self.register_converter(PptxConverter()) self.register_converter(PptxConverter())
self.register_converter(WavConverter()) self.register_converter(AudioConverter())
self.register_converter(Mp3Converter())
self.register_converter(ImageConverter()) self.register_converter(ImageConverter())
self.register_converter(IpynbConverter()) self.register_converter(IpynbConverter())
self.register_converter(PdfConverter()) self.register_converter(PdfConverter())

View file

@ -14,8 +14,7 @@ from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter from ._image_converter import ImageConverter
from ._wav_converter import WavConverter from ._audio_converter import AudioConverter
from ._mp3_converter import Mp3Converter
from ._outlook_msg_converter import OutlookMsgConverter from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter from ._doc_intel_converter import DocumentIntelligenceConverter
@ -34,8 +33,7 @@ __all__ = [
"XlsConverter", "XlsConverter",
"PptxConverter", "PptxConverter",
"ImageConverter", "ImageConverter",
"WavConverter", "AudioConverter",
"Mp3Converter",
"OutlookMsgConverter", "OutlookMsgConverter",
"ZipConverter", "ZipConverter",
"DocumentIntelligenceConverter", "DocumentIntelligenceConverter",

View file

@ -21,9 +21,9 @@ ACCEPTED_FILE_EXTENSIONS = [
] ]
class WavConverter(DocumentConverter): class AudioConverter(DocumentConverter):
""" """
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
""" """
def __init__( def __init__(
@ -37,10 +37,6 @@ class WavConverter(DocumentConverter):
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> bool: ) -> bool:
"""
Make sure we're dealing with HTML content *from* Wikipedia.
"""
mimetype = (stream_info.mimetype or "").lower() mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower() extension = (stream_info.extension or "").lower()

View file

@ -4,7 +4,7 @@ import locale
import sys import sys
import shutil import shutil
import os import os
from warnings import warn import warnings
from typing import BinaryIO, Literal, Optional from typing import BinaryIO, Literal, Optional
@ -15,7 +15,7 @@ def exiftool_metadata(
if not exiftool_path: if not exiftool_path:
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
if which_exiftool: if which_exiftool:
warn( warnings.warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}") md = MarkItDown(exiftool_path="{which_exiftool}")

View file

@ -1,41 +0,0 @@
import subprocess
import shutil
import json
from warnings import warn
from .._base_converter import DocumentConverter
class MediaConverter(DocumentConverter):
    """
    Abstract base class for converters of multi-modal media (e.g., images and audio).
    """

    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)

    def _get_metadata(self, local_path, exiftool_path=None):
        """
        Return the metadata that exiftool reports for `local_path`.

        When `exiftool_path` is given, runs `<exiftool_path> -json <local_path>`,
        parses its standard output as JSON and returns the first element of the
        parsed result. No exception raised along the way is handled here.

        When `exiftool_path` is not given, exiftool is never run and None is
        returned. If an `exiftool` executable is nevertheless found on the PATH,
        a DeprecationWarning is issued to say that implicit discovery is disabled.
        """
        # Explicit path supplied: run the tool and hand back the first JSON entry.
        if exiftool_path:
            completed = subprocess.run(
                [exiftool_path, "-json", local_path], capture_output=True, text=True
            )
            return json.loads(completed.stdout)[0]

        # No explicit path: only look the tool up so the caller can be told how
        # to opt back in; the discovered executable is deliberately not used.
        which_exiftool = shutil.which("exiftool")
        if which_exiftool:
            warn(
                f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
                DeprecationWarning,
            )
        return None

View file

@ -1,86 +0,0 @@
import tempfile
from typing import Union
from .._base_converter import DocumentConverter, DocumentConverterResult
from warnings import resetwarnings, catch_warnings
# Optional transcription support: the flag below only becomes True when both
# `pydub` and `speech_recognition` import without complaint.
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
    # Record any warning raised while importing pydub (it warns when
    # ffmpeg or avconv is missing) so that case can be treated as "unavailable".
    with catch_warnings(record=True) as w:
        import pydub
    if w:
        # A recorded warning is handled exactly like a missing module.
        raise ModuleNotFoundError
    import speech_recognition as sr
except ModuleNotFoundError:
    pass
else:
    # Reached only when both imports succeeded and nothing was recorded.
    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
finally:
    resetwarnings()
class Mp3Converter(DocumentConverter):
    """
    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).

    NOTE: the implementation below is entirely commented out, so this class
    currently defines no methods of its own.
    """

    # Disabled implementation, kept for reference only.
    # NOTE(review): as written it calls `os.close` / `os.unlink` although `os` is
    # not among this file's visible imports, and it relies on
    # `self._get_metadata` and `super()._transcribe_audio`, neither of which is
    # defined in the visible code -- confirm all three before re-enabling.
    #
    # def __init__(
    #     self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    # ):
    #     super().__init__(priority=priority)
    #
    # def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
    #     # Bail if not a MP3
    #     extension = kwargs.get("file_extension", "")
    #     if extension.lower() != ".mp3":
    #         return None
    #
    #     md_content = ""
    #
    #     # Add metadata
    #     metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
    #     if metadata:
    #         for f in [
    #             "Title",
    #             "Artist",
    #             "Author",
    #             "Band",
    #             "Album",
    #             "Genre",
    #             "Track",
    #             "DateTimeOriginal",
    #             "CreateDate",
    #             "Duration",
    #         ]:
    #             if f in metadata:
    #                 md_content += f"{f}: {metadata[f]}\n"
    #
    #     # Transcribe
    #     if IS_AUDIO_TRANSCRIPTION_CAPABLE:
    #         handle, temp_path = tempfile.mkstemp(suffix=".wav")
    #         os.close(handle)
    #         try:
    #             sound = pydub.AudioSegment.from_mp3(local_path)
    #             sound.export(temp_path, format="wav")
    #
    #             _args = dict()
    #             _args.update(kwargs)
    #             _args["file_extension"] = ".wav"
    #
    #             try:
    #                 transcript = super()._transcribe_audio(temp_path).strip()
    #                 md_content += "\n\n### Audio Transcript:\n" + (
    #                     "[No speech detected]" if transcript == "" else transcript
    #                 )
    #             except Exception:
    #                 md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
    #
    #         finally:
    #             os.unlink(temp_path)
    #
    #     # Return the result
    #     return DocumentConverterResult(markdown=md_content.strip())

View file

@ -1,4 +1,5 @@
import io import io
import sys
from typing import BinaryIO from typing import BinaryIO
from .._exceptions import MissingDependencyException from .._exceptions import MissingDependencyException

View file

@ -7,7 +7,7 @@ import openai
import pytest import pytest
import requests import requests
from warnings import catch_warnings, resetwarnings import warnings
from markitdown import ( from markitdown import (
MarkItDown, MarkItDown,
@ -440,14 +440,15 @@ def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning # Test the automatic discovery of exiftool throws a warning
# and is disabled # and is disabled
try: try:
with catch_warnings(record=True) as w: warnings.simplefilter("default")
with warnings.catch_warnings(record=True) as w:
markitdown = MarkItDown() markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1 assert len(w) == 1
assert w[0].category is DeprecationWarning assert w[0].category is DeprecationWarning
assert result.text_content.strip() == "" assert result.text_content.strip() == ""
finally: finally:
resetwarnings() warnings.resetwarnings()
# Test explicitly setting the location of exiftool # Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")