Merge b38ece12be into bf6a15e9b5

2025-02-05 16:54:51 +00:00 · 2025-02-05 16:54:51 +00:00 · d7fa9425b0
commit d7fa9425b0
parent bf6a15e9b5 b38ece12be
4 changed files with 132 additions and 55 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -1,4 +1,5 @@
 # type: ignore
 from io import BytesIO
 import base64
 import binascii
 import copy
@ -79,12 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
-    - Truncating images with large data:uri sources.
+    - Describing data:image URIs with a multimodal LLM (mlm) when one is configured; otherwise truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """
    def __init__(self, **options: Any):
        """Set up the converter, defaulting to ATX ('#') headings.

        The optional ``mlm_client`` and ``mlm_model`` options are also kept on
        the instance. The complete option mapping, including those two keys,
        is still forwarded to the parent converter.
        """
        # Respect a caller-supplied heading style; only fill in the default.
        options.setdefault("heading_style", markdownify.ATX)
        self.mlm_client, self.mlm_model = (
            options.get("mlm_client"),
            options.get("mlm_model"),
        )
        super().__init__(**options)
@ -138,6 +142,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""      
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if (
            convert_as_inline
@ -146,8 +151,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
            return alt
        # Remove dataURIs
-        if src.startswith("data:"):
+        if src.startswith("data:image/"):
-            src = src.split(",")[0] + "..."
+            if self.mlm_client is not None and self.mlm_model is not None:
                md = ImageConverter()
                result = md._convert(src, mlm_client=self.mlm_client, mlm_model=self.mlm_model)
                src = result.text_content if result is not None else src.split(",")[0] + "..."              
            else:
                src = src.split(",")[0] + "..."
        return "![%s](%s%s)" % (alt, src, title_part)
@ -212,11 +222,11 @@ class HtmlConverter(DocumentConverter):
        result = None
        with open(local_path, "rt", encoding="utf-8") as fh:
-            result = self._convert(fh.read())
+            result = self._convert(fh.read(), **kwargs)
        return result
-    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
+    def _convert(self, html_content: str, **kwargs) -> Union[None, DocumentConverterResult]:
        """Helper function that converts and HTML string."""
        # Parse the string
@ -229,10 +239,14 @@ class HtmlConverter(DocumentConverter):
        # Print only the main content
        body_elm = soup.find("body")
        webpage_text = ""
        # add mlm_client and mlm_model to the options
        #options = copy.deepcopy(kwargs)
        if body_elm:
-            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
        else:
-            webpage_text = _CustomMarkdownify().convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
        assert isinstance(webpage_text, str)
@ -726,7 +740,7 @@ class DocxConverter(HtmlConverter):
            result = mammoth.convert_to_html(docx_file, style_map=style_map)
            html_content = result.value
-            result = self._convert(html_content)
+            result = self._convert(html_content, **kwargs)
        return result
@ -791,6 +805,8 @@ class PptxConverter(HtmlConverter):
            return None
        md_content = ""
        self._mlm_client = kwargs.get("mlm_client")
        self._mlm_model = kwargs.get("mlm_model")     
        presentation = pptx.Presentation(local_path)
        slide_num = 0
@ -819,6 +835,7 @@ class PptxConverter(HtmlConverter):
                        + filename
                        + ")\n"
                    )
                    md_content += self._convert_image_to_markdown(shape)
                # Tables
                if self._is_table(shape):
@ -863,6 +880,29 @@ class PptxConverter(HtmlConverter):
            text_content=md_content.strip(),
        )
    def _convert_image_to_markdown(self, shape) -> str:
        """Return a multimodal-LLM description of a picture shape as markdown.

        This is best-effort. An empty string is returned when ``shape`` is not
        a picture, when ``_mlm_client`` or ``_mlm_model`` is not set, when
        ``ImageConverter._convert`` yields ``None`` (it does so for data URIs
        it does not accept), or when anything fails while reading or
        describing the image.

        Args:
            shape: A slide shape. When it is a picture, its ``image`` attribute
                is read for ``content_type`` and ``blob``.

        Returns:
            The stripped description wrapped in a leading and trailing
            newline, or ``""``.
        """
        if not self._is_picture(shape):
            return ""
        if self._mlm_client is None or self._mlm_model is None:
            return ""

        try:
            # Reading the image data sits inside the try so that a failure
            # here is handled the same way as a failed description.
            image = shape.image
            encoded = base64.b64encode(image.blob).decode("utf-8")
            image_base64_uri = f"data:{image.content_type};base64,{encoded}"
            result = ImageConverter()._convert(
                image_base64_uri,
                mlm_client=self._mlm_client,
                mlm_model=self._mlm_model,
            )
        except Exception as e:
            # Diagnostics go to stderr only, so stdout stays free for the
            # converted document.
            sys.stderr.write(f"Error converting image to markdown: {e}\n")
            return ""

        # None means the converter declined the image; that is not an error.
        if result is None:
            return ""
        return "\n" + result.text_content.strip() + "\n"
    def _is_picture(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
@ -1050,6 +1090,36 @@ class ImageConverter(MediaConverter):
    """
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """
    def _convert(self, data_base64_uri, **kwargs) -> Union[None, DocumentConverterResult]:
        """Describe a base64 image data URI with a multimodal LLM.

        Args:
            data_base64_uri: A string of the form
                ``data:<content-type>;base64,<payload>``.
            **kwargs: ``mlm_client`` and ``mlm_model`` must both be present
                (and not ``None``) for a description to be requested;
                ``mlm_prompt`` is passed through as the prompt.

        Returns:
            ``None`` when the content-type prefix is not one of
            ``data:image/jpg``, ``data:image/jpeg`` or ``data:image/png``
            (compared case-insensitively), or when the prefix cannot be
            extracted at all. Otherwise a ``DocumentConverterResult`` whose
            text is the "# Image Description:" section, or an empty string
            when no client/model pair was supplied.
        """
        # Bail if not a supported image type (or not a string-like value).
        try:
            content_type = data_base64_uri.split(",")[0].split(";")[0].lower()
        except Exception:
            return None
        if content_type not in ("data:image/jpg", "data:image/jpeg", "data:image/png"):
            return None

        # Try describing the image with the multimodal LLM.
        mlm_client = kwargs.get("mlm_client")
        mlm_model = kwargs.get("mlm_model")

        md_content = ""
        if mlm_client is not None and mlm_model is not None:
            description = self._get_mlm_description(
                data_base64_uri,
                mlm_client,
                mlm_model,
                prompt=kwargs.get("mlm_prompt"),
            )
            # The response may carry no text; treat that as an empty
            # description instead of failing on ``None.strip()``.
            md_content = (
                "\n# Image Description:\n" + (description or "").strip() + "\n"
            )

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an image
@ -1077,38 +1147,28 @@ class ImageConverter(MediaConverter):
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"
-        # Try describing the image with GPTV
+        image_base64_uri = self._get_image_base64(local_path, extension)
-        llm_client = kwargs.get("llm_client")
+        md_content += self._convert(image_base64_uri, **kwargs).text_content        
        llm_model = kwargs.get("llm_model")
        if llm_client is not None and llm_model is not None:
            md_content += (
                "\n# Description:\n"
                + self._get_llm_description(
                    local_path,
                    extension,
                    llm_client,
                    llm_model,
                    prompt=kwargs.get("llm_prompt"),
                ).strip()
                + "\n"
            )
        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )
-    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_image_base64(self, local_path, extension):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."
        data_uri = ""
        with open(local_path, "rb") as image_file:
            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
            if content_type is None:
                content_type = "image/jpeg"
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
-            data_uri = f"data:{content_type};base64,{image_base64}"
+            
            return f"data:{content_type};base64,{image_base64}"  
    def _get_mlm_description(self, data_base64_uri, client, model, prompt=None):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."
        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
        messages = [
            {
@ -1118,7 +1178,7 @@ class ImageConverter(MediaConverter):
                    {
                        "type": "image_url",
                        "image_url": {
-                            "url": data_uri,
+                            "url": data_base64_uri,
                        },
                    },
                ],
@ -1128,7 +1188,6 @@ class ImageConverter(MediaConverter):
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
 class OutlookMsgConverter(DocumentConverter):
    """Converts Outlook .msg files to markdown by extracting email metadata and content.
@ -1565,6 +1624,9 @@ class MarkItDown:
            # Convert
            result = self._convert(temp_path, extensions, **kwargs)
        except Exception as e:
            sys.stderr.write(f"Error converting stream to markdown: {e}")
            pass
        # Clean up
        finally:
            try:
@ -1636,22 +1698,22 @@ class MarkItDown:
    ) -> DocumentConverterResult:
        error_trace = ""
        for ext in extensions + [None]:  # Try last with no extension
            _kwargs = copy.deepcopy(kwargs)
            # Overwrite file_extension appropriately
            if ext is None:
                if "file_extension" in _kwargs:
                    del _kwargs["file_extension"]
            else:
                _kwargs.update({"file_extension": ext})
            # Copy any additional global options
            if "mlm_client" not in _kwargs and self._llm_client is not None:
                _kwargs["mlm_client"] = self._llm_client
            if "mlm_model" not in _kwargs and self._llm_model is not None:
                _kwargs["mlm_model"] = self._llm_model      
            for converter in self._page_converters:
                _kwargs = copy.deepcopy(kwargs)
                # Overwrite file_extension appropriately
                if ext is None:
                    if "file_extension" in _kwargs:
                        del _kwargs["file_extension"]
                else:
                    _kwargs.update({"file_extension": ext})
                # Copy any additional global options
                if "llm_client" not in _kwargs and self._llm_client is not None:
                    _kwargs["llm_client"] = self._llm_client
                if "llm_model" not in _kwargs and self._llm_model is not None:
                    _kwargs["llm_model"] = self._llm_model
                if "style_map" not in _kwargs and self._style_map is not None:
                    _kwargs["style_map"] = self._style_map
--- a/tests/test_files/test.docx
+++ b/tests/test_files/test.docx
--- a/tests/test_files/test.pptx
+++ b/tests/test_files/test.pptx
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -1,8 +1,9 @@
 #!/usr/bin/env python3 -m pytest
 import io
 import os
 from dotenv import load_dotenv
 import shutil
-
+from openai import OpenAI, AzureOpenAI
 import pytest
 import requests
@ -134,6 +135,7 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
 ]
 CSV_CP932_TEST_STRINGS = [
    "名前,年齢,住所",
    "佐藤太郎,30,東京",
@ -189,8 +191,20 @@ def test_markitdown_remote() -> None:
    #     assert test_string in result.text_content
-def test_markitdown_local() -> None:
+def test_markitdown_local(use_mlm = False) -> None:
-    markitdown = MarkItDown()
+    if (use_mlm):
        load_dotenv()   
        client = AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
        )
        llm_model="gpt-4oModel"
        markitdown = MarkItDown(llm_client=client, llm_model=llm_model)
    else:
        markitdown = MarkItDown()
    # Test XLSX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
@ -305,7 +319,6 @@ def test_markitdown_exiftool() -> None:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
        assert target in result.text_content
 def test_markitdown_deprecation() -> None:
    try:
        with catch_warnings(record=True) as w:
@ -361,6 +374,8 @@ def test_markitdown_llm() -> None:
 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_markitdown_remote()
    test_markitdown_local(True)
    # test_markitdown_remote()
    # test_markitdown_local()
    test_markitdown_exiftool()