Merge branch 'main' into main

This commit is contained in:
Ayman Hamed Moustafa 2025-01-24 11:23:33 +02:00 committed by GitHub
commit 7b1088cc80
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 55 additions and 19 deletions

View file

@ -1,6 +1,3 @@
> [!IMPORTANT]
> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year!
# MarkItDown
[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)

View file

@ -893,14 +893,25 @@ class MediaConverter(DocumentConverter):
Abstract class for multi-modal media (e.g., images and audio)
"""
def _get_metadata(self, local_path):
exiftool = shutil.which("exiftool")
if not exiftool:
def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
return None
else:
try:
result = subprocess.run(
[exiftool, "-json", local_path], capture_output=True, text=True
[exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout
return json.loads(result)[0]
except Exception:
@ -921,7 +932,7 @@ class WavConverter(MediaConverter):
md_content = ""
# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"Title",
@ -976,7 +987,7 @@ class Mp3Converter(WavConverter):
md_content = ""
# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"Title",
@ -1037,7 +1048,7 @@ class ImageConverter(MediaConverter):
md_content = ""
# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"ImageSize",
@ -1344,6 +1355,7 @@ class MarkItDown:
llm_client: Optional[Any] = None,
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
exiftool_path: Optional[str] = None,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
@ -1353,6 +1365,9 @@ class MarkItDown:
else:
self._requests_session = requests_session
if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH")
# Handle deprecation notices
#############################
if mlm_client is not None:
@ -1385,6 +1400,7 @@ class MarkItDown:
self._llm_client = llm_client
self._llm_model = llm_model
self._style_map = style_map
self._exiftool_path = exiftool_path
self._page_converters: List[DocumentConverter] = []
@ -1568,12 +1584,15 @@ class MarkItDown:
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
# If we hit an error log it and keep trying
try:
res = converter.convert(local_path, **_kwargs)

View file

@ -277,9 +277,29 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
markitdown = MarkItDown()
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test JPG metadata processing
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool)
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content
# Test setting the exiftool path through an environment variable
os.environ["EXIFTOOL_PATH"] = which_exiftool
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
@ -341,8 +361,8 @@ def test_markitdown_llm() -> None:
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_markitdown_remote()
test_markitdown_local()
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()
# test_markitdown_deprecation()
# test_markitdown_llm()