Fixed exif warning test.
This commit is contained in:
parent
a9ceb13feb
commit
736e0ae332
8 changed files with 13 additions and 146 deletions
|
|
@ -33,8 +33,7 @@ from .converters import (
|
|||
XlsConverter,
|
||||
PptxConverter,
|
||||
ImageConverter,
|
||||
WavConverter,
|
||||
Mp3Converter,
|
||||
AudioConverter,
|
||||
OutlookMsgConverter,
|
||||
ZipConverter,
|
||||
DocumentIntelligenceConverter,
|
||||
|
|
@ -140,8 +139,7 @@ class MarkItDown:
|
|||
self.register_converter(XlsxConverter())
|
||||
self.register_converter(XlsConverter())
|
||||
self.register_converter(PptxConverter())
|
||||
self.register_converter(WavConverter())
|
||||
self.register_converter(Mp3Converter())
|
||||
self.register_converter(AudioConverter())
|
||||
self.register_converter(ImageConverter())
|
||||
self.register_converter(IpynbConverter())
|
||||
self.register_converter(PdfConverter())
|
||||
|
|
|
|||
|
|
@ -14,8 +14,7 @@ from ._docx_converter import DocxConverter
|
|||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||
from ._pptx_converter import PptxConverter
|
||||
from ._image_converter import ImageConverter
|
||||
from ._wav_converter import WavConverter
|
||||
from ._mp3_converter import Mp3Converter
|
||||
from ._audio_converter import AudioConverter
|
||||
from ._outlook_msg_converter import OutlookMsgConverter
|
||||
from ._zip_converter import ZipConverter
|
||||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
||||
|
|
@ -34,8 +33,7 @@ __all__ = [
|
|||
"XlsConverter",
|
||||
"PptxConverter",
|
||||
"ImageConverter",
|
||||
"WavConverter",
|
||||
"Mp3Converter",
|
||||
"AudioConverter",
|
||||
"OutlookMsgConverter",
|
||||
"ZipConverter",
|
||||
"DocumentIntelligenceConverter",
|
||||
|
|
|
|||
|
|
@ -21,9 +21,9 @@ ACCEPTED_FILE_EXTENSIONS = [
|
|||
]
|
||||
|
||||
|
||||
class WavConverter(DocumentConverter):
|
||||
class AudioConverter(DocumentConverter):
|
||||
"""
|
||||
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
|
@ -37,10 +37,6 @@ class WavConverter(DocumentConverter):
|
|||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
"""
|
||||
Make sure we're dealing with HTML content *from* Wikipedia.
|
||||
"""
|
||||
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
|
|
@ -4,7 +4,7 @@ import locale
|
|||
import sys
|
||||
import shutil
|
||||
import os
|
||||
from warnings import warn
|
||||
import warnings
|
||||
from typing import BinaryIO, Literal, Optional
|
||||
|
||||
|
||||
|
|
@ -15,7 +15,7 @@ def exiftool_metadata(
|
|||
if not exiftool_path:
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
if which_exiftool:
|
||||
warn(
|
||||
warnings.warn(
|
||||
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
||||
|
||||
md = MarkItDown(exiftool_path="{which_exiftool}")
|
||||
|
|
|
|||
|
|
@ -1,41 +0,0 @@
|
|||
import subprocess
|
||||
import shutil
|
||||
import json
|
||||
from warnings import warn
|
||||
|
||||
from .._base_converter import DocumentConverter
|
||||
|
||||
|
||||
class MediaConverter(DocumentConverter):
|
||||
"""
|
||||
Abstract class for multi-modal media (e.g., images and audio)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def _get_metadata(self, local_path, exiftool_path=None):
|
||||
if not exiftool_path:
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
if which_exiftool:
|
||||
warn(
|
||||
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
||||
|
||||
md = MarkItDown(exiftool_path="{which_exiftool}")
|
||||
|
||||
This warning will be removed in future releases.
|
||||
""",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
return None
|
||||
else:
|
||||
if True:
|
||||
result = subprocess.run(
|
||||
[exiftool_path, "-json", local_path], capture_output=True, text=True
|
||||
).stdout
|
||||
return json.loads(result)[0]
|
||||
# except Exception:
|
||||
# return None
|
||||
|
|
@ -1,86 +0,0 @@
|
|||
import tempfile
|
||||
from typing import Union
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from warnings import resetwarnings, catch_warnings
|
||||
|
||||
# Optional Transcription support
|
||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||
try:
|
||||
# Using warnings' catch_warnings to catch
|
||||
# pydub's warning of ffmpeg or avconv missing
|
||||
with catch_warnings(record=True) as w:
|
||||
import pydub
|
||||
|
||||
if w:
|
||||
raise ModuleNotFoundError
|
||||
import speech_recognition as sr
|
||||
|
||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
||||
except ModuleNotFoundError:
|
||||
pass
|
||||
finally:
|
||||
resetwarnings()
|
||||
|
||||
|
||||
class Mp3Converter(DocumentConverter):
|
||||
"""
|
||||
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
||||
"""
|
||||
|
||||
|
||||
# def __init__(
|
||||
# self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
# ):
|
||||
# super().__init__(priority=priority)
|
||||
#
|
||||
# def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# # Bail if not a MP3
|
||||
# extension = kwargs.get("file_extension", "")
|
||||
# if extension.lower() != ".mp3":
|
||||
# return None
|
||||
#
|
||||
# md_content = ""
|
||||
#
|
||||
# # Add metadata
|
||||
# metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
||||
# if metadata:
|
||||
# for f in [
|
||||
# "Title",
|
||||
# "Artist",
|
||||
# "Author",
|
||||
# "Band",
|
||||
# "Album",
|
||||
# "Genre",
|
||||
# "Track",
|
||||
# "DateTimeOriginal",
|
||||
# "CreateDate",
|
||||
# "Duration",
|
||||
# ]:
|
||||
# if f in metadata:
|
||||
# md_content += f"{f}: {metadata[f]}\n"
|
||||
#
|
||||
# # Transcribe
|
||||
# if IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
||||
# handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
||||
# os.close(handle)
|
||||
# try:
|
||||
# sound = pydub.AudioSegment.from_mp3(local_path)
|
||||
# sound.export(temp_path, format="wav")
|
||||
#
|
||||
# _args = dict()
|
||||
# _args.update(kwargs)
|
||||
# _args["file_extension"] = ".wav"
|
||||
#
|
||||
# try:
|
||||
# transcript = super()._transcribe_audio(temp_path).strip()
|
||||
# md_content += "\n\n### Audio Transcript:\n" + (
|
||||
# "[No speech detected]" if transcript == "" else transcript
|
||||
# )
|
||||
# except Exception:
|
||||
# md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
||||
#
|
||||
# finally:
|
||||
# os.unlink(temp_path)
|
||||
#
|
||||
# # Return the result
|
||||
# return DocumentConverterResult(markdown=md_content.strip())
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
import io
|
||||
import sys
|
||||
from typing import BinaryIO
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import openai
|
|||
import pytest
|
||||
import requests
|
||||
|
||||
from warnings import catch_warnings, resetwarnings
|
||||
import warnings
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
|
|
@ -440,14 +440,15 @@ def test_markitdown_exiftool() -> None:
|
|||
# Test the automatic discovery of exiftool throws a warning
|
||||
# and is disabled
|
||||
try:
|
||||
with catch_warnings(record=True) as w:
|
||||
warnings.simplefilter("default")
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||
assert len(w) == 1
|
||||
assert w[0].category is DeprecationWarning
|
||||
assert result.text_content.strip() == ""
|
||||
finally:
|
||||
resetwarnings()
|
||||
warnings.resetwarnings()
|
||||
|
||||
# Test explicitly setting the location of exiftool
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
|
|
|
|||
Loading…
Reference in a new issue