This commit is contained in:
FeuRicardo 2025-02-05 16:54:51 +00:00 committed by GitHub
commit d7fa9425b0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 132 additions and 55 deletions

View file

@ -1,4 +1,5 @@
# type: ignore # type: ignore
from io import BytesIO
import base64 import base64
import binascii import binascii
import copy import copy
@ -79,12 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
- Altering the default heading style to use '#', '##', etc. - Altering the default heading style to use '#', '##', etc.
- Removing javascript hyperlinks. - Removing javascript hyperlinks.
- Truncating images with large data:uri sources. - Transcribing images with a multimodal LLM (MLM) when one is configured; otherwise truncating images with large data:uri sources.
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
""" """
def __init__(self, **options: Any): def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX) options["heading_style"] = options.get("heading_style", markdownify.ATX)
self.mlm_client = options.get("mlm_client")
self.mlm_model = options.get("mlm_model")
# Explicitly cast options to the expected type if necessary # Explicitly cast options to the expected type if necessary
super().__init__(**options) super().__init__(**options)
@ -138,6 +142,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
alt = el.attrs.get("alt", None) or "" alt = el.attrs.get("alt", None) or ""
src = el.attrs.get("src", None) or "" src = el.attrs.get("src", None) or ""
title = el.attrs.get("title", None) or "" title = el.attrs.get("title", None) or ""
title_part = ' "%s"' % title.replace('"', r"\"") if title else "" title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
if ( if (
convert_as_inline convert_as_inline
@ -146,7 +151,12 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return alt return alt
# Remove dataURIs # Remove dataURIs
if src.startswith("data:"): if src.startswith("data:image/"):
if self.mlm_client is not None and self.mlm_model is not None:
md = ImageConverter()
result = md._convert(src, mlm_client=self.mlm_client, mlm_model=self.mlm_model)
src = result.text_content if result is not None else src.split(",")[0] + "..."
else:
src = src.split(",")[0] + "..." src = src.split(",")[0] + "..."
return "![%s](%s%s)" % (alt, src, title_part) return "![%s](%s%s)" % (alt, src, title_part)
@ -212,11 +222,11 @@ class HtmlConverter(DocumentConverter):
result = None result = None
with open(local_path, "rt", encoding="utf-8") as fh: with open(local_path, "rt", encoding="utf-8") as fh:
result = self._convert(fh.read()) result = self._convert(fh.read(), **kwargs)
return result return result
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: def _convert(self, html_content: str, **kwargs) -> Union[None, DocumentConverterResult]:
"""Helper function that converts an HTML string.""" """Helper function that converts an HTML string."""
# Parse the string # Parse the string
@ -229,10 +239,14 @@ class HtmlConverter(DocumentConverter):
# Print only the main content # Print only the main content
body_elm = soup.find("body") body_elm = soup.find("body")
webpage_text = "" webpage_text = ""
# add mlm_client and mlm_model to the options
#options = copy.deepcopy(kwargs)
if body_elm: if body_elm:
webpage_text = _CustomMarkdownify().convert_soup(body_elm) webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else: else:
webpage_text = _CustomMarkdownify().convert_soup(soup) webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
assert isinstance(webpage_text, str) assert isinstance(webpage_text, str)
@ -726,7 +740,7 @@ class DocxConverter(HtmlConverter):
result = mammoth.convert_to_html(docx_file, style_map=style_map) result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value html_content = result.value
result = self._convert(html_content) result = self._convert(html_content, **kwargs)
return result return result
@ -791,6 +805,8 @@ class PptxConverter(HtmlConverter):
return None return None
md_content = "" md_content = ""
self._mlm_client = kwargs.get("mlm_client")
self._mlm_model = kwargs.get("mlm_model")
presentation = pptx.Presentation(local_path) presentation = pptx.Presentation(local_path)
slide_num = 0 slide_num = 0
@ -819,6 +835,7 @@ class PptxConverter(HtmlConverter):
+ filename + filename
+ ")\n" + ")\n"
) )
md_content += self._convert_image_to_markdown(shape)
# Tables # Tables
if self._is_table(shape): if self._is_table(shape):
@ -863,6 +880,29 @@ class PptxConverter(HtmlConverter):
text_content=md_content.strip(), text_content=md_content.strip(),
) )
def _convert_image_to_markdown(self, shape) -> str:
    """Describe a picture shape with the multimodal LLM and return markdown.

    The picture's bytes are wrapped in a base64 ``data:`` URI and handed to
    ``ImageConverter._convert`` together with the MLM client and model that
    ``convert`` stored on this instance.

    Args:
        shape: A slide shape; anything for which ``self._is_picture`` is
            false is ignored.

    Returns:
        The description surrounded by newlines, or ``""`` when the shape is
        not a picture, no MLM client/model is configured, the image type is
        not supported by ``ImageConverter._convert``, or the description
        attempt fails.
    """
    if not self._is_picture(shape):
        return ""

    # getattr: these attributes are only assigned inside convert(), so do not
    # assume they exist if this helper is reached some other way.
    mlm_client = getattr(self, "_mlm_client", None)
    mlm_model = getattr(self, "_mlm_model", None)
    if mlm_client is None or mlm_model is None:
        return ""

    try:
        # Reading shape.image stays inside the try so that a picture whose
        # image data cannot be read is skipped rather than aborting the whole
        # conversion (NOTE(review): confirm which exceptions python-pptx
        # raises here).
        image = shape.image
        encoded = base64.b64encode(image.blob).decode("utf-8")
        image_base64_uri = f"data:{image.content_type};base64,{encoded}"
        image_description = ImageConverter()._convert(
            image_base64_uri, mlm_client=mlm_client, mlm_model=mlm_model
        )
    except Exception as e:
        # Best effort: a failed description must not break slide conversion.
        sys.stderr.write(f"Error converting image to markdown: {e}\n")
        return ""

    # _convert returns None for content types it does not handle; that is an
    # expected outcome, not an error.
    if image_description is None:
        return ""
    return "\n" + image_description.text_content.strip() + "\n"
def _is_picture(self, shape): def _is_picture(self, shape):
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
return True return True
@ -1050,6 +1090,36 @@ class ImageConverter(MediaConverter):
""" """
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
""" """
def _convert(self, data_base64_uri, **kwargs) -> Union[None, DocumentConverterResult]:
    """Describe an image supplied as a base64 ``data:`` URI.

    Args:
        data_base64_uri: A string of the form ``data:<mime>;base64,<payload>``.
        **kwargs: Reads ``mlm_client`` and ``mlm_model`` (both required for a
            description to be produced) and the optional ``mlm_prompt``.

    Returns:
        ``None`` when the argument is not a string or its media type is not
        JPEG/PNG. Otherwise a ``DocumentConverterResult`` whose
        ``text_content`` is an "Image Description" section, or an empty
        string when no client/model is configured or the model returned no
        text.
    """
    # Bail if not a supported image data URI
    if not isinstance(data_base64_uri, str):
        return None
    content_type = data_base64_uri.split(",")[0].split(";")[0]
    if content_type.lower() not in ("data:image/jpg", "data:image/jpeg", "data:image/png"):
        return None

    # Try describing the image with the multimodal LLM
    mlm_client = kwargs.get("mlm_client")
    mlm_model = kwargs.get("mlm_model")

    md_content = ""
    if mlm_client is not None and mlm_model is not None:
        description = self._get_mlm_description(
            data_base64_uri,
            mlm_client,
            mlm_model,
            prompt=kwargs.get("mlm_prompt"),
        )
        # The completion's message content may be None (no text returned);
        # calling .strip() on it directly would raise AttributeError.
        description = (description or "").strip()
        if description:
            md_content = "\n# Image Description:\n" + description + "\n"

    return DocumentConverterResult(
        title=None,
        text_content=md_content,
    )
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image # Bail if not an image
@ -1077,38 +1147,28 @@ class ImageConverter(MediaConverter):
if f in metadata: if f in metadata:
md_content += f"{f}: {metadata[f]}\n" md_content += f"{f}: {metadata[f]}\n"
# Try describing the image with GPTV image_base64_uri = self._get_image_base64(local_path, extension)
llm_client = kwargs.get("llm_client") md_content += self._convert(image_base64_uri, **kwargs).text_content
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
md_content += (
"\n# Description:\n"
+ self._get_llm_description(
local_path,
extension,
llm_client,
llm_model,
prompt=kwargs.get("llm_prompt"),
).strip()
+ "\n"
)
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=md_content, text_content=md_content,
) )
def _get_llm_description(self, local_path, extension, client, model, prompt=None): def _get_image_base64(self, local_path, extension):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."
data_uri = ""
with open(local_path, "rb") as image_file: with open(local_path, "rb") as image_file:
content_type, encoding = mimetypes.guess_type("_dummy" + extension) content_type, encoding = mimetypes.guess_type("_dummy" + extension)
if content_type is None: if content_type is None:
content_type = "image/jpeg" content_type = "image/jpeg"
image_base64 = base64.b64encode(image_file.read()).decode("utf-8") image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}"
return f"data:{content_type};base64,{image_base64}"
def _get_mlm_description(self, data_base64_uri, client, model, prompt=None):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."
sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
messages = [ messages = [
{ {
@ -1118,7 +1178,7 @@ class ImageConverter(MediaConverter):
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": data_uri, "url": data_base64_uri,
}, },
}, },
], ],
@ -1128,7 +1188,6 @@ class ImageConverter(MediaConverter):
response = client.chat.completions.create(model=model, messages=messages) response = client.chat.completions.create(model=model, messages=messages)
return response.choices[0].message.content return response.choices[0].message.content
class OutlookMsgConverter(DocumentConverter): class OutlookMsgConverter(DocumentConverter):
"""Converts Outlook .msg files to markdown by extracting email metadata and content. """Converts Outlook .msg files to markdown by extracting email metadata and content.
@ -1565,6 +1624,9 @@ class MarkItDown:
# Convert # Convert
result = self._convert(temp_path, extensions, **kwargs) result = self._convert(temp_path, extensions, **kwargs)
except Exception as e:
sys.stderr.write(f"Error converting stream to markdown: {e}")
pass
# Clean up # Clean up
finally: finally:
try: try:
@ -1636,9 +1698,7 @@ class MarkItDown:
) -> DocumentConverterResult: ) -> DocumentConverterResult:
error_trace = "" error_trace = ""
for ext in extensions + [None]: # Try last with no extension for ext in extensions + [None]: # Try last with no extension
for converter in self._page_converters:
_kwargs = copy.deepcopy(kwargs) _kwargs = copy.deepcopy(kwargs)
# Overwrite file_extension appropriately # Overwrite file_extension appropriately
if ext is None: if ext is None:
if "file_extension" in _kwargs: if "file_extension" in _kwargs:
@ -1647,11 +1707,13 @@ class MarkItDown:
_kwargs.update({"file_extension": ext}) _kwargs.update({"file_extension": ext})
# Copy any additional global options # Copy any additional global options
if "llm_client" not in _kwargs and self._llm_client is not None: if "mlm_client" not in _kwargs and self._llm_client is not None:
_kwargs["llm_client"] = self._llm_client _kwargs["mlm_client"] = self._llm_client
if "llm_model" not in _kwargs and self._llm_model is not None: if "mlm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model _kwargs["mlm_model"] = self._llm_model
for converter in self._page_converters:
if "style_map" not in _kwargs and self._style_map is not None: if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map _kwargs["style_map"] = self._style_map

Binary file not shown.

Binary file not shown.

View file

@ -1,8 +1,9 @@
#!/usr/bin/env python3 -m pytest #!/usr/bin/env python3 -m pytest
import io import io
import os import os
from dotenv import load_dotenv
import shutil import shutil
from openai import OpenAI, AzureOpenAI
import pytest import pytest
import requests import requests
@ -134,6 +135,7 @@ SERP_TEST_EXCLUDES = [
"data:image/svg+xml,%3Csvg%20width%3D", "data:image/svg+xml,%3Csvg%20width%3D",
] ]
CSV_CP932_TEST_STRINGS = [ CSV_CP932_TEST_STRINGS = [
"名前,年齢,住所", "名前,年齢,住所",
"佐藤太郎,30,東京", "佐藤太郎,30,東京",
@ -189,9 +191,21 @@ def test_markitdown_remote() -> None:
# assert test_string in result.text_content # assert test_string in result.text_content
def test_markitdown_local() -> None: def test_markitdown_local(use_mlm = False) -> None:
if (use_mlm):
load_dotenv()
client = AzureOpenAI(
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)
llm_model="gpt-4oModel"
markitdown = MarkItDown(llm_client=client, llm_model=llm_model)
else:
markitdown = MarkItDown() markitdown = MarkItDown()
# Test XLSX processing # Test XLSX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
validate_strings(result, XLSX_TEST_STRINGS) validate_strings(result, XLSX_TEST_STRINGS)
@ -305,7 +319,6 @@ def test_markitdown_exiftool() -> None:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content assert target in result.text_content
def test_markitdown_deprecation() -> None: def test_markitdown_deprecation() -> None:
try: try:
with catch_warnings(record=True) as w: with catch_warnings(record=True) as w:
@ -361,6 +374,8 @@ def test_markitdown_llm() -> None:
if __name__ == "__main__": if __name__ == "__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
test_markitdown_remote()
test_markitdown_local(True)
# test_markitdown_remote() # test_markitdown_remote()
# test_markitdown_local() # test_markitdown_local()
test_markitdown_exiftool() test_markitdown_exiftool()