Stream exiftool.

2025-03-04 17:18:54 -08:00 · 2025-03-04 17:18:54 -08:00 · 4a034da269
commit 4a034da269
parent 7879028c98
2 changed files with 127 additions and 53 deletions
--- a/packages/markitdown/src/markitdown/converters/_exiftool.py
+++ b/packages/markitdown/src/markitdown/converters/_exiftool.py
@@ -0,0 +1,44 @@
+import json
+import locale
+import os
+import shutil
+import subprocess
+import sys
+from typing import Any, BinaryIO, Literal, Optional
+from warnings import warn
+
+
+def exiftool_metadata(
+    file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
+) -> dict[str, Any]:
+    """Return the tags exiftool reports for file_stream, or {} without a path.
+
+    The unread rest of the stream is piped to `<exiftool_path> -json -`; the
+    stream position is restored afterwards, whether or not the call succeeds.
+    """
+    if not exiftool_path:
+        which_exiftool = shutil.which("exiftool")
+        if which_exiftool:  # Installed but not opted in: show how to configure it
+            warn(
+                f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g., 
+
+    md = MarkItDown(exiftool_path="{which_exiftool}")
+
+This warning will be removed in future releases.
+""",
+                DeprecationWarning,
+            )
+        return {}
+
+    cur_pos = file_stream.tell()
+    try:
+        output = subprocess.run(
+            [exiftool_path, "-json", "-"],
+            input=file_stream.read(),
+            capture_output=True,
+            text=False,
+        ).stdout
+        # exiftool's JSON is UTF-8, not locale-encoded; json.loads decodes bytes itself
+        return json.loads(output)[0]
+    finally:
+        file_stream.seek(cur_pos)
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -1,11 +1,20 @@
-from typing import Union
-from .._base_converter import DocumentConverter, DocumentConverterResult
-from ._media_converter import MediaConverter
+from typing import BinaryIO, Any
 import base64
 import mimetypes
+from ._exiftool import exiftool_metadata
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+ACCEPTED_MIME_TYPE_PREFIXES = [  # accepts() matches these with str.startswith
+    "image/jpeg",
+    "image/png",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]  # exact match, lowercased


-class ImageConverter(MediaConverter):
+class ImageConverter(DocumentConverter):
    """
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """
@@ -15,16 +24,36 @@ class ImageConverter(MediaConverter):
    ):
        super().__init__(priority=priority)

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not an image
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Accept streams with a listed image extension or MIME type prefix."""
+        # A missing (falsy) field becomes "" so it can be lowercased and compared.
+        mime = (stream_info.mimetype or "").lower()
+        suffix = (stream_info.extension or "").lower()

+        # An exact extension match is enough on its own.
+        if suffix in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        # Otherwise the MIME type has to start with one of the accepted prefixes.
+        return any(mime.startswith(p) for p in ACCEPTED_MIME_TYPE_PREFIXES)
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        md_content = ""

        # Add metadata
-        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
+        metadata = exiftool_metadata(
+            file_stream, exiftool_path=kwargs.get("exiftool_path")
+        )

        if metadata:
            for f in [
@@ -42,52 +71,53 @@ class ImageConverter(MediaConverter):
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

-        # Try describing the image with GPTV
-        llm_client = kwargs.get("llm_client")
-        llm_model = kwargs.get("llm_model")
-        if llm_client is not None and llm_model is not None:
-            md_content += (
-                "\n# Description:\n"
-                + self._get_llm_description(
-                    local_path,
-                    extension,
-                    llm_client,
-                    llm_model,
-                    prompt=kwargs.get("llm_prompt"),
-                ).strip()
-                + "\n"
-            )
+        #        # Try describing the image with GPTV
+        #        llm_client = kwargs.get("llm_client")
+        #        llm_model = kwargs.get("llm_model")
+        #        if llm_client is not None and llm_model is not None:
+        #            md_content += (
+        #                "\n# Description:\n"
+        #                + self._get_llm_description(
+        #                    local_path,
+        #                    extension,
+        #                    llm_client,
+        #                    llm_model,
+        #                    prompt=kwargs.get("llm_prompt"),
+        #                ).strip()
+        #                + "\n"
+        #            )

        return DocumentConverterResult(
            markdown=md_content,
        )

-    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
-        if prompt is None or prompt.strip() == "":
-            prompt = "Write a detailed caption for this image."

-        data_uri = ""
-        with open(local_path, "rb") as image_file:
-            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
-            if content_type is None:
-                content_type = "image/jpeg"
-            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
-            data_uri = f"data:{content_type};base64,{image_base64}"
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": data_uri,
-                        },
-                    },
-                ],
-            }
-        ]
-
-        response = client.chat.completions.create(model=model, messages=messages)
-        return response.choices[0].message.content
+#    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
+#        if prompt is None or prompt.strip() == "":
+#            prompt = "Write a detailed caption for this image."
+#
+#        data_uri = ""
+#        with open(local_path, "rb") as image_file:
+#            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+#            if content_type is None:
+#                content_type = "image/jpeg"
+#            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+#            data_uri = f"data:{content_type};base64,{image_base64}"
+#
+#        messages = [
+#            {
+#                "role": "user",
+#                "content": [
+#                    {"type": "text", "text": prompt},
+#                    {
+#                        "type": "image_url",
+#                        "image_url": {
+#                            "url": data_uri,
+#                        },
+#                    },
+#                ],
+#            }
+#        ]
+#
+#        response = client.chat.completions.create(model=model, messages=messages)
+#        return response.choices[0].message.content