Support Async LLM Client, reuse llm_caption helper

hlohaus 2025-04-30 23:16:22 +02:00
parent 041be54471
commit 4a2f793869
5 changed files with 34 additions and 69 deletions

View file

@@ -8,7 +8,7 @@ from ._markitdown import (
     PRIORITY_SPECIFIC_FILE_FORMAT,
     PRIORITY_GENERIC_FILE_FORMAT,
 )
-from ._base_converter import DocumentConverterResult, DocumentConverter
+from ._base_converter import DocumentConverterResult, AsyncDocumentConverterResult, DocumentConverter
 from ._stream_info import StreamInfo
 from ._exceptions import (
     MarkItDownException,
@@ -23,6 +23,7 @@ __all__ = [
     "MarkItDown",
     "DocumentConverter",
     "DocumentConverterResult",
+    "AsyncDocumentConverterResult",
     "MarkItDownException",
     "MissingDependencyException",
     "FailedConversionAttempt",

View file

@@ -1,7 +1,4 @@
-import os
-import tempfile
-from warnings import warn
-from typing import Any, Union, BinaryIO, Optional, List
+from typing import Any, BinaryIO, Optional, Awaitable
 
 from ._stream_info import StreamInfo
 
@@ -41,6 +38,14 @@ class DocumentConverterResult:
         """Return the converted Markdown text."""
         return self.markdown
+
+class AsyncDocumentConverterResult:
+    """The result of converting a document to Markdown."""
+    def __init__(
+        self,
+        text_content: Awaitable[str],
+    ):
+        self.text_content = text_content
 
 
 class DocumentConverter:
     """Abstract superclass of all DocumentConverters."""

View file

@@ -1,11 +1,9 @@
-import copy
 import mimetypes
 import os
 import re
 import sys
 import shutil
-import tempfile
-import warnings
+import asyncio
 import traceback
 import io
 from dataclasses import dataclass
@@ -600,6 +598,9 @@ class MarkItDown:
                 finally:
                     file_stream.seek(cur_pos)
 
+            if asyncio.iscoroutine(res):
+                return res
+
             if res is not None:
                 # Normalize the content
                 res.text_content = "\n".join(
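One plausible reading of this pass-through: a registered converter may implement convert() as a coroutine function, and MarkItDown.convert() then hands the un-awaited coroutine straight back to the caller, skipping text normalization. A sketch under that assumption (EchoConverter, the notes.txt path, and the use of the standard register_converter API are illustrative, not part of this commit):

import asyncio

from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult


class EchoConverter(DocumentConverter):
    """Illustrative converter whose convert() is a coroutine function."""

    def accepts(self, file_stream, stream_info, **kwargs) -> bool:
        return True

    async def convert(self, file_stream, stream_info, **kwargs) -> DocumentConverterResult:
        text = file_stream.read().decode("utf-8", errors="replace")
        return DocumentConverterResult(markdown=text)


async def main() -> None:
    md = MarkItDown()
    md.register_converter(EchoConverter())
    result = md.convert("notes.txt")  # hypothetical local file
    if asyncio.iscoroutine(result):   # an async converter handled it
        result = await result
    print(result.markdown)


asyncio.run(main())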

View file

@@ -1,9 +1,9 @@
-from typing import BinaryIO, Any, Union
-import base64
-import mimetypes
+from typing import BinaryIO, Any
+import asyncio
 from ._exiftool import exiftool_metadata
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult, AsyncDocumentConverterResult
 from .._stream_info import StreamInfo
+from ._llm_caption import llm_caption
 
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "image/jpeg",
@@ -69,7 +69,7 @@ class ImageConverter(DocumentConverter):
         llm_client = kwargs.get("llm_client")
         llm_model = kwargs.get("llm_model")
         if llm_client is not None and llm_model is not None:
-            llm_description = self._get_llm_description(
+            llm_description = llm_caption(
                 file_stream,
                 stream_info,
                 client=llm_client,
@@ -77,62 +77,14 @@
                 prompt=kwargs.get("llm_prompt"),
             )
 
+            if asyncio.iscoroutine(llm_description):
+                return AsyncDocumentConverterResult(
+                    llm_description,
+                )
+
             if llm_description is not None:
                 md_content += "\n# Description:\n" + llm_description.strip() + "\n"
 
         return DocumentConverterResult(
             markdown=md_content,
         )
-
-    def _get_llm_description(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        *,
-        client,
-        model,
-        prompt=None,
-    ) -> Union[None, str]:
-        if prompt is None or prompt.strip() == "":
-            prompt = "Write a detailed caption for this image."
-
-        # Get the content type
-        content_type = stream_info.mimetype
-        if not content_type:
-            content_type, _ = mimetypes.guess_type(
-                "_dummy" + (stream_info.extension or "")
-            )
-        if not content_type:
-            content_type = "application/octet-stream"
-
-        # Convert to base64
-        cur_pos = file_stream.tell()
-        try:
-            base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
-        except Exception as e:
-            return None
-        finally:
-            file_stream.seek(cur_pos)
-
-        # Prepare the data-uri
-        data_uri = f"data:{content_type};base64,{base64_image}"
-
-        # Prepare the OpenAI API request
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": data_uri,
-                        },
-                    },
-                ],
-            }
-        ]
-
-        # Call the OpenAI API
-        response = client.chat.completions.create(model=model, messages=messages)
-        return response.choices[0].message.content

View file

@@ -1,12 +1,13 @@
-from typing import BinaryIO, Any, Union
+from typing import BinaryIO, Any, Union, Awaitable
 import base64
 import mimetypes
+import asyncio
 
 from .._stream_info import StreamInfo
 
 
 def llm_caption(
     file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
-) -> Union[None, str]:
+) -> Union[None, str, Awaitable[str]]:
     if prompt is None or prompt.strip() == "":
         prompt = "Write a detailed caption for this image."
@@ -47,4 +48,9 @@ def llm_caption(
 
     # Call the OpenAI API
     response = client.chat.completions.create(model=model, messages=messages)
+    if asyncio.iscoroutine(response):
+        async def read_content():
+            result = await response
+            return result.choices[0].message.content
+        return read_content()
     return response.choices[0].message.content
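Taken together, llm_caption now works with both client flavours: a synchronous OpenAI-style client yields a plain string, while an async client (whose chat.completions.create call returns a coroutine) yields an Awaitable[str]. A usage sketch, assuming openai.AsyncOpenAI as the async client; the markitdown.converters._llm_caption module path is taken from the upstream markitdown layout, and the image path and model name are placeholders:

import asyncio

from openai import AsyncOpenAI  # any client whose create() returns a coroutine would do

from markitdown._stream_info import StreamInfo
from markitdown.converters._llm_caption import llm_caption  # path assumed from upstream markitdown


async def main() -> None:
    with open("photo.jpg", "rb") as fh:  # hypothetical local image
        caption = llm_caption(
            fh,
            StreamInfo(mimetype="image/jpeg", extension=".jpg"),
            client=AsyncOpenAI(),
            model="gpt-4o",  # placeholder model name
        )
        # Async client: the helper returns the un-awaited coroutine.
        if asyncio.iscoroutine(caption):
            caption = await caption
    print(caption)


asyncio.run(main())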