Stream exiftool.

This commit is contained in:
Adam Fourney 2025-03-04 17:18:54 -08:00
parent 7879028c98
commit 4a034da269
2 changed files with 127 additions and 53 deletions

View file

@ -0,0 +1,44 @@
import json
import locale
import os
import shutil
import subprocess
import sys
from typing import Any, BinaryIO, Literal, Optional
from warnings import warn
def exiftool_metadata(
    file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
) -> dict[str, Any]:
    """Return the metadata that ``exiftool`` reports for a binary stream.

    Everything from the stream's current position onward is piped to
    ``<exiftool_path> -json -`` on standard input, and the first object of the
    JSON array printed on standard output is returned. The stream is always
    moved back to the position it had on entry, even when running the tool or
    parsing its output fails.

    Args:
        file_stream: Binary stream to inspect. It must support ``tell()``,
            ``read()`` and ``seek()``.
        exiftool_path: Path of the exiftool executable. When falsy, nothing is
            executed; if an ``exiftool`` binary is nevertheless found on the
            PATH, a ``DeprecationWarning`` explains how to opt in explicitly.

    Returns:
        The decoded JSON object for the stream, or an empty dict when no
        ``exiftool_path`` was given.
    """
    # Without an explicit path we never run exiftool; we only tell the user
    # how to re-enable it if a binary happens to be discoverable.
    if not exiftool_path:
        which_exiftool = shutil.which("exiftool")
        if which_exiftool:
            warn(
                f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
                DeprecationWarning,
            )
        # Nothing to do
        return {}

    # Run exiftool, remembering where the caller left the stream.
    cur_pos = file_stream.tell()
    try:
        output = subprocess.run(
            [exiftool_path, "-json", "-"],
            input=file_stream.read(),
            capture_output=True,
            text=False,
        ).stdout

        # exiftool writes its JSON as UTF-8, so decode it as such; only fall
        # back to the locale's encoding when the bytes are not valid UTF-8.
        try:
            decoded = output.decode("utf-8")
        except UnicodeDecodeError:
            decoded = output.decode(locale.getpreferredencoding(False))

        # exiftool emits a JSON array with one object per input file.
        return json.loads(decoded)[0]
    finally:
        file_stream.seek(cur_pos)

View file

@ -1,11 +1,20 @@
from typing import Union
from .._base_converter import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
from typing import BinaryIO, Any
import base64
import mimetypes
from ._exiftool import exiftool_metadata
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# File extensions (lower-case, including the leading dot) that mark a stream
# as an image this module handles.
ACCEPTED_FILE_EXTENSIONS = [
    ".jpg",
    ".jpeg",
    ".png",
]

# MIME type prefixes (lower-case) that mark a stream as an image this module
# handles.
ACCEPTED_MIME_TYPE_PREFIXES = ["image/jpeg", "image/png"]
class ImageConverter(MediaConverter):
class ImageConverter(DocumentConverter):
"""
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""
@ -15,16 +24,36 @@ class ImageConverter(MediaConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
return None
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> bool:
    """Report whether the stream looks like an image this converter handles.

    The decision uses only ``stream_info``: its extension must be one of
    ``ACCEPTED_FILE_EXTENSIONS``, or its MIME type must start with one of
    ``ACCEPTED_MIME_TYPE_PREFIXES``. Both values are compared in lower case,
    and a missing value is treated as an empty string. ``file_stream`` and
    ``kwargs`` are not consulted.
    """
    mime = (stream_info.mimetype or "").lower()
    ext = (stream_info.extension or "").lower()

    # A recognised extension is sufficient on its own.
    if ext in ACCEPTED_FILE_EXTENSIONS:
        return True

    # Otherwise fall back to the declared MIME type.
    return any(mime.startswith(known) for known in ACCEPTED_MIME_TYPE_PREFIXES)
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
md_content = ""
# Add metadata
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
metadata = exiftool_metadata(
file_stream, exiftool_path=kwargs.get("exiftool_path")
)
if metadata:
for f in [
@ -42,52 +71,53 @@ class ImageConverter(MediaConverter):
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"
# Try describing the image with GPTV
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
md_content += (
"\n# Description:\n"
+ self._get_llm_description(
local_path,
extension,
llm_client,
llm_model,
prompt=kwargs.get("llm_prompt"),
).strip()
+ "\n"
)
# # Try describing the image with GPTV
# llm_client = kwargs.get("llm_client")
# llm_model = kwargs.get("llm_model")
# if llm_client is not None and llm_model is not None:
# md_content += (
# "\n# Description:\n"
# + self._get_llm_description(
# local_path,
# extension,
# llm_client,
# llm_model,
# prompt=kwargs.get("llm_prompt"),
# ).strip()
# + "\n"
# )
return DocumentConverterResult(
markdown=md_content,
)
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
    """Ask a chat-completion client to describe the image at ``local_path``.

    The file is read, base64-encoded and embedded in a ``data:`` URI, which is
    sent together with the text prompt as a single user message.

    Args:
        local_path: Path of the image file to read.
        extension: File extension (with leading dot) used to guess the
            content type; ``image/jpeg`` is used when the guess fails.
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        model: Model identifier forwarded to ``client``.
        prompt: Text instruction for the model. ``None`` or a blank string
            selects a default captioning prompt.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    # Read the bytes up front so the file is closed before the request is built.
    with open(local_path, "rb") as image_file:
        image_bytes = image_file.read()

    # Only the extension matters for the guess, hence the dummy file name.
    content_type, _ = mimetypes.guess_type("_dummy" + extension)
    if content_type is None:
        content_type = "image/jpeg"

    image_base64 = base64.b64encode(image_bytes).decode("utf-8")
    data_uri = f"data:{content_type};base64,{image_base64}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_uri,
                    },
                },
            ],
        }
    ]

    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content
# def _get_llm_description(self, local_path, extension, client, model, prompt=None):
# if prompt is None or prompt.strip() == "":
# prompt = "Write a detailed caption for this image."
#
# data_uri = ""
# with open(local_path, "rb") as image_file:
# content_type, encoding = mimetypes.guess_type("_dummy" + extension)
# if content_type is None:
# content_type = "image/jpeg"
# image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
# data_uri = f"data:{content_type};base64,{image_base64}"
#
# messages = [
# {
# "role": "user",
# "content": [
# {"type": "text", "text": prompt},
# {
# "type": "image_url",
# "image_url": {
# "url": data_uri,
# },
# },
# ],
# }
# ]
#
# response = client.chat.completions.create(model=model, messages=messages)
# return response.choices[0].message.content