Merge 928ddab91a into 73ba69d8cd

2025-02-09 16:44:35 +00:00 · 2025-02-09 16:44:35 +00:00 · f86147a536
commit f86147a536
parent 73ba69d8cd 928ddab91a
1 changed files with 76 additions and 0 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -734,6 +734,29 @@ class DocxConverter(HtmlConverter):
            html_content = result.value
            result = self._convert(html_content)
            
+        # Extract any base64 encoded images from the HTML
+        descriptions = []
+        if kwargs.get("llm_client") and kwargs.get("llm_model"):
+            for match in re.finditer(r'data:image/[^;]+;base64,([^"\']+)', html_content):
+                img_converter = ImageConverter()
+                descriptions.append(img_converter.convert_from_base64(match.group(1),'.png',**kwargs))
+
+        # Replace each base64 image with its description
+        if descriptions and result:
+            text_content = result.text_content
+
+            # Pattern for markdown images whose target is a base64 data URI
+            base64_pattern = r'!\[[\s\S]*?\]\(data:image/[a-z]+;base64.*?\)'
+
+            # Find all base64 image markdown patterns
+            matches = list(re.finditer(base64_pattern, text_content))
+            
+            # Replace each match with corresponding description
+            for i, match in enumerate(matches):
+                if i < len(descriptions):
+                    text_content = text_content.replace(match.group(), f'[Image description {i}] \n{descriptions[i]}\n[End Image description {i}]')
+            result.text_content = text_content
+
        return result


@ -1184,6 +1207,59 @@ class ImageConverter(MediaConverter):
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
    
+    def _get_llm_description_from_base64(
+        self,
+        base64_str: str,
+        extension: str,
+        client: Any,
+        model: str,
+        prompt: Optional[str] = None,
+    ) -> str:
+        """Ask the LLM to describe an image supplied as base64 text.
+
+        Args:
+            base64_str: Base64 image payload. When it contains a comma, only
+                the segment right after the first comma is kept, which drops
+                a leading ``data:...;base64,`` prefix.
+            extension: File extension (e.g. ``".png"``) used to guess the
+                MIME type written into the data URI sent to the model.
+            client: Object exposing ``chat.completions.create``.
+            model: Model name forwarded to the client.
+            prompt: Instruction for the model. A default caption prompt is
+                used when this is ``None`` or blank.
+
+        Returns:
+            The message content of the first choice in the client's response.
+        """
+        if not (prompt and prompt.strip()):
+            prompt = "Write a detailed caption for this image."
+
+        # Keep only the payload that follows a data-URI style prefix.
+        pieces = base64_str.split(',')
+        payload = pieces[1] if len(pieces) > 1 else base64_str
+
+        # The MIME type comes from the extension alone; unknown extensions
+        # fall back to JPEG.
+        guessed_type = mimetypes.guess_type("_dummy" + extension)[0]
+        mime_type = "image/jpeg" if guessed_type is None else guessed_type
+
+        text_part = {"type": "text", "text": prompt}
+        image_part = {
+            "type": "image_url",
+            "image_url": {"url": f"data:{mime_type};base64,{payload}"},
+        }
+        messages = [{"role": "user", "content": [text_part, image_part]}]
+
+        completion = client.chat.completions.create(model=model, messages=messages)
+        return completion.choices[0].message.content
+
+    def convert_from_base64(
+        self,
+        base64_str: str,
+        extension: str,
+        **kwargs: Any,
+    ) -> Optional[str]:
+        """Describe a base64-encoded image using the configured LLM.
+
+        Args:
+            base64_str: Base64 image data, passed through unchanged to
+                ``_get_llm_description_from_base64``.
+            extension: File extension (e.g. ``".png"``), passed through
+                unchanged as well.
+            **kwargs: Reads ``llm_client``, ``llm_model`` and the optional
+                ``llm_prompt``; other keys are ignored.
+
+        Returns:
+            The description string produced by the model, or ``None`` when
+            ``llm_client`` or ``llm_model`` is not supplied.
+        """
+        client = kwargs.get("llm_client")
+        model = kwargs.get("llm_model")
+        # Nothing to query without both a client and a model, so bail out
+        # instead of failing on an attribute lookup on None.
+        if client is None or model is None:
+            return None
+        return self._get_llm_description_from_base64(
+            base64_str, extension, client, model, kwargs.get("llm_prompt")
+        )
+

 class OutlookMsgConverter(DocumentConverter):
    """Converts Outlook .msg files to markdown by extracting email metadata and content.