Fixed exif warning test.
This commit is contained in:
parent
a9ceb13feb
commit
736e0ae332
8 changed files with 13 additions and 146 deletions
|
|
@ -33,8 +33,7 @@ from .converters import (
|
||||||
XlsConverter,
|
XlsConverter,
|
||||||
PptxConverter,
|
PptxConverter,
|
||||||
ImageConverter,
|
ImageConverter,
|
||||||
WavConverter,
|
AudioConverter,
|
||||||
Mp3Converter,
|
|
||||||
OutlookMsgConverter,
|
OutlookMsgConverter,
|
||||||
ZipConverter,
|
ZipConverter,
|
||||||
DocumentIntelligenceConverter,
|
DocumentIntelligenceConverter,
|
||||||
|
|
@ -140,8 +139,7 @@ class MarkItDown:
|
||||||
self.register_converter(XlsxConverter())
|
self.register_converter(XlsxConverter())
|
||||||
self.register_converter(XlsConverter())
|
self.register_converter(XlsConverter())
|
||||||
self.register_converter(PptxConverter())
|
self.register_converter(PptxConverter())
|
||||||
self.register_converter(WavConverter())
|
self.register_converter(AudioConverter())
|
||||||
self.register_converter(Mp3Converter())
|
|
||||||
self.register_converter(ImageConverter())
|
self.register_converter(ImageConverter())
|
||||||
self.register_converter(IpynbConverter())
|
self.register_converter(IpynbConverter())
|
||||||
self.register_converter(PdfConverter())
|
self.register_converter(PdfConverter())
|
||||||
|
|
|
||||||
|
|
@ -14,8 +14,7 @@ from ._docx_converter import DocxConverter
|
||||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||||
from ._pptx_converter import PptxConverter
|
from ._pptx_converter import PptxConverter
|
||||||
from ._image_converter import ImageConverter
|
from ._image_converter import ImageConverter
|
||||||
from ._wav_converter import WavConverter
|
from ._audio_converter import AudioConverter
|
||||||
from ._mp3_converter import Mp3Converter
|
|
||||||
from ._outlook_msg_converter import OutlookMsgConverter
|
from ._outlook_msg_converter import OutlookMsgConverter
|
||||||
from ._zip_converter import ZipConverter
|
from ._zip_converter import ZipConverter
|
||||||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
from ._doc_intel_converter import DocumentIntelligenceConverter
|
||||||
|
|
@ -34,8 +33,7 @@ __all__ = [
|
||||||
"XlsConverter",
|
"XlsConverter",
|
||||||
"PptxConverter",
|
"PptxConverter",
|
||||||
"ImageConverter",
|
"ImageConverter",
|
||||||
"WavConverter",
|
"AudioConverter",
|
||||||
"Mp3Converter",
|
|
||||||
"OutlookMsgConverter",
|
"OutlookMsgConverter",
|
||||||
"ZipConverter",
|
"ZipConverter",
|
||||||
"DocumentIntelligenceConverter",
|
"DocumentIntelligenceConverter",
|
||||||
|
|
|
||||||
|
|
@ -21,9 +21,9 @@ ACCEPTED_FILE_EXTENSIONS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
class WavConverter(DocumentConverter):
|
class AudioConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|
@ -37,10 +37,6 @@ class WavConverter(DocumentConverter):
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""
|
|
||||||
Make sure we're dealing with HTML content *from* Wikipedia.
|
|
||||||
"""
|
|
||||||
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
extension = (stream_info.extension or "").lower()
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
|
@ -4,7 +4,7 @@ import locale
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
import os
|
import os
|
||||||
from warnings import warn
|
import warnings
|
||||||
from typing import BinaryIO, Literal, Optional
|
from typing import BinaryIO, Literal, Optional
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -15,7 +15,7 @@ def exiftool_metadata(
|
||||||
if not exiftool_path:
|
if not exiftool_path:
|
||||||
which_exiftool = shutil.which("exiftool")
|
which_exiftool = shutil.which("exiftool")
|
||||||
if which_exiftool:
|
if which_exiftool:
|
||||||
warn(
|
warnings.warn(
|
||||||
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
||||||
|
|
||||||
md = MarkItDown(exiftool_path="{which_exiftool}")
|
md = MarkItDown(exiftool_path="{which_exiftool}")
|
||||||
|
|
|
||||||
|
|
@ -1,41 +0,0 @@
|
||||||
import subprocess
|
|
||||||
import shutil
|
|
||||||
import json
|
|
||||||
from warnings import warn
|
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter
|
|
||||||
|
|
||||||
|
|
||||||
class MediaConverter(DocumentConverter):
|
|
||||||
"""
|
|
||||||
Abstract class for multi-modal media (e.g., images and audio)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def _get_metadata(self, local_path, exiftool_path=None):
|
|
||||||
if not exiftool_path:
|
|
||||||
which_exiftool = shutil.which("exiftool")
|
|
||||||
if which_exiftool:
|
|
||||||
warn(
|
|
||||||
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
|
||||||
|
|
||||||
md = MarkItDown(exiftool_path="{which_exiftool}")
|
|
||||||
|
|
||||||
This warning will be removed in future releases.
|
|
||||||
""",
|
|
||||||
DeprecationWarning,
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
if True:
|
|
||||||
result = subprocess.run(
|
|
||||||
[exiftool_path, "-json", local_path], capture_output=True, text=True
|
|
||||||
).stdout
|
|
||||||
return json.loads(result)[0]
|
|
||||||
# except Exception:
|
|
||||||
# return None
|
|
||||||
|
|
@ -1,86 +0,0 @@
|
||||||
import tempfile
|
|
||||||
from typing import Union
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
|
||||||
from warnings import resetwarnings, catch_warnings
|
|
||||||
|
|
||||||
# Optional Transcription support
|
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
|
||||||
try:
|
|
||||||
# Using warnings' catch_warnings to catch
|
|
||||||
# pydub's warning of ffmpeg or avconv missing
|
|
||||||
with catch_warnings(record=True) as w:
|
|
||||||
import pydub
|
|
||||||
|
|
||||||
if w:
|
|
||||||
raise ModuleNotFoundError
|
|
||||||
import speech_recognition as sr
|
|
||||||
|
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
|
||||||
except ModuleNotFoundError:
|
|
||||||
pass
|
|
||||||
finally:
|
|
||||||
resetwarnings()
|
|
||||||
|
|
||||||
|
|
||||||
class Mp3Converter(DocumentConverter):
|
|
||||||
"""
|
|
||||||
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
# def __init__(
|
|
||||||
# self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
# ):
|
|
||||||
# super().__init__(priority=priority)
|
|
||||||
#
|
|
||||||
# def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
|
||||||
# # Bail if not a MP3
|
|
||||||
# extension = kwargs.get("file_extension", "")
|
|
||||||
# if extension.lower() != ".mp3":
|
|
||||||
# return None
|
|
||||||
#
|
|
||||||
# md_content = ""
|
|
||||||
#
|
|
||||||
# # Add metadata
|
|
||||||
# metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
|
||||||
# if metadata:
|
|
||||||
# for f in [
|
|
||||||
# "Title",
|
|
||||||
# "Artist",
|
|
||||||
# "Author",
|
|
||||||
# "Band",
|
|
||||||
# "Album",
|
|
||||||
# "Genre",
|
|
||||||
# "Track",
|
|
||||||
# "DateTimeOriginal",
|
|
||||||
# "CreateDate",
|
|
||||||
# "Duration",
|
|
||||||
# ]:
|
|
||||||
# if f in metadata:
|
|
||||||
# md_content += f"{f}: {metadata[f]}\n"
|
|
||||||
#
|
|
||||||
# # Transcribe
|
|
||||||
# if IS_AUDIO_TRANSCRIPTION_CAPABLE:
|
|
||||||
# handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
|
||||||
# os.close(handle)
|
|
||||||
# try:
|
|
||||||
# sound = pydub.AudioSegment.from_mp3(local_path)
|
|
||||||
# sound.export(temp_path, format="wav")
|
|
||||||
#
|
|
||||||
# _args = dict()
|
|
||||||
# _args.update(kwargs)
|
|
||||||
# _args["file_extension"] = ".wav"
|
|
||||||
#
|
|
||||||
# try:
|
|
||||||
# transcript = super()._transcribe_audio(temp_path).strip()
|
|
||||||
# md_content += "\n\n### Audio Transcript:\n" + (
|
|
||||||
# "[No speech detected]" if transcript == "" else transcript
|
|
||||||
# )
|
|
||||||
# except Exception:
|
|
||||||
# md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
|
||||||
#
|
|
||||||
# finally:
|
|
||||||
# os.unlink(temp_path)
|
|
||||||
#
|
|
||||||
# # Return the result
|
|
||||||
# return DocumentConverterResult(markdown=md_content.strip())
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import io
|
import io
|
||||||
|
import sys
|
||||||
from typing import BinaryIO
|
from typing import BinaryIO
|
||||||
from .._exceptions import MissingDependencyException
|
from .._exceptions import MissingDependencyException
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import openai
|
||||||
import pytest
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from warnings import catch_warnings, resetwarnings
|
import warnings
|
||||||
|
|
||||||
from markitdown import (
|
from markitdown import (
|
||||||
MarkItDown,
|
MarkItDown,
|
||||||
|
|
@ -440,14 +440,15 @@ def test_markitdown_exiftool() -> None:
|
||||||
# Test the automatic discovery of exiftool throws a warning
|
# Test the automatic discovery of exiftool throws a warning
|
||||||
# and is disabled
|
# and is disabled
|
||||||
try:
|
try:
|
||||||
with catch_warnings(record=True) as w:
|
warnings.simplefilter("default")
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown()
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||||
assert len(w) == 1
|
assert len(w) == 1
|
||||||
assert w[0].category is DeprecationWarning
|
assert w[0].category is DeprecationWarning
|
||||||
assert result.text_content.strip() == ""
|
assert result.text_content.strip() == ""
|
||||||
finally:
|
finally:
|
||||||
resetwarnings()
|
warnings.resetwarnings()
|
||||||
|
|
||||||
# Test explicitly setting the location of exiftool
|
# Test explicitly setting the location of exiftool
|
||||||
which_exiftool = shutil.which("exiftool")
|
which_exiftool = shutil.which("exiftool")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue