Merge b38ece12be into bf6a15e9b5
This commit is contained in:
commit
d7fa9425b0
4 changed files with 132 additions and 55 deletions
|
|
@ -1,4 +1,5 @@
|
||||||
# type: ignore
|
# type: ignore
|
||||||
|
from io import BytesIO
|
||||||
import base64
|
import base64
|
||||||
import binascii
|
import binascii
|
||||||
import copy
|
import copy
|
||||||
|
|
@ -79,12 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
|
|
||||||
- Altering the default heading style to use '#', '##', etc.
|
- Altering the default heading style to use '#', '##', etc.
|
||||||
- Removing javascript hyperlinks.
|
- Removing javascript hyperlinks.
|
||||||
- Truncating images with large data:uri sources.
|
- Transcribing images with an MLM when one is configured; otherwise, truncating images with large data:uri sources.
|
||||||
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
|
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, **options: Any):
|
def __init__(self, **options: Any):
|
||||||
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
||||||
|
|
||||||
|
self.mlm_client = options.get("mlm_client")
|
||||||
|
self.mlm_model = options.get("mlm_model")
|
||||||
# Explicitly cast options to the expected type if necessary
|
# Explicitly cast options to the expected type if necessary
|
||||||
super().__init__(**options)
|
super().__init__(**options)
|
||||||
|
|
||||||
|
|
@ -138,6 +142,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
alt = el.attrs.get("alt", None) or ""
|
alt = el.attrs.get("alt", None) or ""
|
||||||
src = el.attrs.get("src", None) or ""
|
src = el.attrs.get("src", None) or ""
|
||||||
title = el.attrs.get("title", None) or ""
|
title = el.attrs.get("title", None) or ""
|
||||||
|
|
||||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||||
if (
|
if (
|
||||||
convert_as_inline
|
convert_as_inline
|
||||||
|
|
@ -146,8 +151,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
return alt
|
return alt
|
||||||
|
|
||||||
# Remove dataURIs
|
# Remove dataURIs
|
||||||
if src.startswith("data:"):
|
if src.startswith("data:image/"):
|
||||||
src = src.split(",")[0] + "..."
|
if self.mlm_client is not None and self.mlm_model is not None:
|
||||||
|
md = ImageConverter()
|
||||||
|
result = md._convert(src, mlm_client=self.mlm_client, mlm_model=self.mlm_model)
|
||||||
|
src = result.text_content if result is not None else src.split(",")[0] + "..."
|
||||||
|
else:
|
||||||
|
src = src.split(",")[0] + "..."
|
||||||
|
|
||||||
return "" % (alt, src, title_part)
|
return "" % (alt, src, title_part)
|
||||||
|
|
||||||
|
|
@ -212,11 +222,11 @@ class HtmlConverter(DocumentConverter):
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
with open(local_path, "rt", encoding="utf-8") as fh:
|
||||||
result = self._convert(fh.read())
|
result = self._convert(fh.read(), **kwargs)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
|
def _convert(self, html_content: str, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
"""Helper function that converts an HTML string."""
|
"""Helper function that converts an HTML string."""
|
||||||
|
|
||||||
# Parse the string
|
# Parse the string
|
||||||
|
|
@ -229,10 +239,14 @@ class HtmlConverter(DocumentConverter):
|
||||||
# Print only the main content
|
# Print only the main content
|
||||||
body_elm = soup.find("body")
|
body_elm = soup.find("body")
|
||||||
webpage_text = ""
|
webpage_text = ""
|
||||||
|
|
||||||
|
# add mlm_client and mlm_model to the options
|
||||||
|
#options = copy.deepcopy(kwargs)
|
||||||
|
|
||||||
if body_elm:
|
if body_elm:
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
|
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
|
||||||
else:
|
else:
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
|
||||||
|
|
||||||
assert isinstance(webpage_text, str)
|
assert isinstance(webpage_text, str)
|
||||||
|
|
||||||
|
|
@ -726,7 +740,7 @@ class DocxConverter(HtmlConverter):
|
||||||
|
|
||||||
result = mammoth.convert_to_html(docx_file, style_map=style_map)
|
result = mammoth.convert_to_html(docx_file, style_map=style_map)
|
||||||
html_content = result.value
|
html_content = result.value
|
||||||
result = self._convert(html_content)
|
result = self._convert(html_content, **kwargs)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
@ -791,6 +805,8 @@ class PptxConverter(HtmlConverter):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
self._mlm_client = kwargs.get("mlm_client")
|
||||||
|
self._mlm_model = kwargs.get("mlm_model")
|
||||||
|
|
||||||
presentation = pptx.Presentation(local_path)
|
presentation = pptx.Presentation(local_path)
|
||||||
slide_num = 0
|
slide_num = 0
|
||||||
|
|
@ -819,6 +835,7 @@ class PptxConverter(HtmlConverter):
|
||||||
+ filename
|
+ filename
|
||||||
+ ")\n"
|
+ ")\n"
|
||||||
)
|
)
|
||||||
|
md_content += self._convert_image_to_markdown(shape)
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
if self._is_table(shape):
|
if self._is_table(shape):
|
||||||
|
|
@ -863,6 +880,29 @@ class PptxConverter(HtmlConverter):
|
||||||
text_content=md_content.strip(),
|
text_content=md_content.strip(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _convert_image_to_markdown(self, shape) -> str:
|
||||||
|
if not self._is_picture(shape):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
image_converter = ImageConverter() if (self._mlm_client is not None) and (self._mlm_model is not None) else None
|
||||||
|
|
||||||
|
if image_converter is not None:
|
||||||
|
image = shape.image
|
||||||
|
content_type = image.content_type
|
||||||
|
blob = image.blob
|
||||||
|
|
||||||
|
try:
|
||||||
|
ext = f"data:{content_type};base64"
|
||||||
|
image_base64_uri = f"{ext},{base64.b64encode(blob).decode('utf-8')}"
|
||||||
|
image_description = image_converter._convert(image_base64_uri, mlm_client=self._mlm_client, mlm_model=self._mlm_model)
|
||||||
|
|
||||||
|
return ("\n" + image_description.text_content.strip() + "\n")
|
||||||
|
except Exception as e:
|
||||||
|
print("Error converting image to markdown")
|
||||||
|
sys.stderr.write(f"Error converting image to markdown: {e}")
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
def _is_picture(self, shape):
|
def _is_picture(self, shape):
|
||||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
||||||
return True
|
return True
|
||||||
|
|
@ -1050,6 +1090,36 @@ class ImageConverter(MediaConverter):
|
||||||
"""
|
"""
|
||||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||||
"""
|
"""
|
||||||
|
def _convert(self, data_base64_uri, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
|
# Bail if not an image
|
||||||
|
try:
|
||||||
|
content_type = data_base64_uri.split(",")[0].split(";")[0]
|
||||||
|
if content_type.lower() not in ["data:image/jpg", "data:image/jpeg", "data:image/png"]:
|
||||||
|
return None
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Try describing the image with GPTV
|
||||||
|
mlm_client = kwargs.get("mlm_client")
|
||||||
|
mlm_model = kwargs.get("mlm_model")
|
||||||
|
md_content = ""
|
||||||
|
|
||||||
|
if mlm_client is not None and mlm_model is not None:
|
||||||
|
md_content = (
|
||||||
|
"\n# Image Description:\n"
|
||||||
|
+ self._get_mlm_description(
|
||||||
|
data_base64_uri,
|
||||||
|
mlm_client,
|
||||||
|
mlm_model,
|
||||||
|
prompt=kwargs.get("mlm_prompt"),
|
||||||
|
).strip()
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
return DocumentConverterResult(
|
||||||
|
title=None,
|
||||||
|
text_content=md_content,
|
||||||
|
)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not an image
|
# Bail if not an image
|
||||||
|
|
@ -1077,38 +1147,28 @@ class ImageConverter(MediaConverter):
|
||||||
if f in metadata:
|
if f in metadata:
|
||||||
md_content += f"{f}: {metadata[f]}\n"
|
md_content += f"{f}: {metadata[f]}\n"
|
||||||
|
|
||||||
# Try describing the image with GPTV
|
image_base64_uri = self._get_image_base64(local_path, extension)
|
||||||
llm_client = kwargs.get("llm_client")
|
md_content += self._convert(image_base64_uri, **kwargs).text_content
|
||||||
llm_model = kwargs.get("llm_model")
|
|
||||||
if llm_client is not None and llm_model is not None:
|
|
||||||
md_content += (
|
|
||||||
"\n# Description:\n"
|
|
||||||
+ self._get_llm_description(
|
|
||||||
local_path,
|
|
||||||
extension,
|
|
||||||
llm_client,
|
|
||||||
llm_model,
|
|
||||||
prompt=kwargs.get("llm_prompt"),
|
|
||||||
).strip()
|
|
||||||
+ "\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=md_content,
|
text_content=md_content,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
|
def _get_image_base64(self, local_path, extension):
|
||||||
if prompt is None or prompt.strip() == "":
|
|
||||||
prompt = "Write a detailed caption for this image."
|
|
||||||
|
|
||||||
data_uri = ""
|
|
||||||
with open(local_path, "rb") as image_file:
|
with open(local_path, "rb") as image_file:
|
||||||
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
||||||
if content_type is None:
|
if content_type is None:
|
||||||
content_type = "image/jpeg"
|
content_type = "image/jpeg"
|
||||||
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
||||||
data_uri = f"data:{content_type};base64,{image_base64}"
|
|
||||||
|
return f"data:{content_type};base64,{image_base64}"
|
||||||
|
|
||||||
|
def _get_mlm_description(self, data_base64_uri, client, model, prompt=None):
|
||||||
|
if prompt is None or prompt.strip() == "":
|
||||||
|
prompt = "Write a detailed caption for this image."
|
||||||
|
|
||||||
|
sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
|
|
@ -1118,7 +1178,7 @@ class ImageConverter(MediaConverter):
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {
|
||||||
"url": data_uri,
|
"url": data_base64_uri,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
@ -1128,7 +1188,6 @@ class ImageConverter(MediaConverter):
|
||||||
response = client.chat.completions.create(model=model, messages=messages)
|
response = client.chat.completions.create(model=model, messages=messages)
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|
||||||
|
|
||||||
class OutlookMsgConverter(DocumentConverter):
|
class OutlookMsgConverter(DocumentConverter):
|
||||||
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
|
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
|
||||||
|
|
||||||
|
|
@ -1565,6 +1624,9 @@ class MarkItDown:
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
result = self._convert(temp_path, extensions, **kwargs)
|
result = self._convert(temp_path, extensions, **kwargs)
|
||||||
|
except Exception as e:
|
||||||
|
sys.stderr.write(f"Error converting stream to markdown: {e}")
|
||||||
|
pass
|
||||||
# Clean up
|
# Clean up
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
|
|
@ -1636,22 +1698,22 @@ class MarkItDown:
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
error_trace = ""
|
error_trace = ""
|
||||||
for ext in extensions + [None]: # Try last with no extension
|
for ext in extensions + [None]: # Try last with no extension
|
||||||
|
_kwargs = copy.deepcopy(kwargs)
|
||||||
|
# Overwrite file_extension appropriately
|
||||||
|
if ext is None:
|
||||||
|
if "file_extension" in _kwargs:
|
||||||
|
del _kwargs["file_extension"]
|
||||||
|
else:
|
||||||
|
_kwargs.update({"file_extension": ext})
|
||||||
|
|
||||||
|
# Copy any additional global options
|
||||||
|
if "mlm_client" not in _kwargs and self._llm_client is not None:
|
||||||
|
_kwargs["mlm_client"] = self._llm_client
|
||||||
|
|
||||||
|
if "mlm_model" not in _kwargs and self._llm_model is not None:
|
||||||
|
_kwargs["mlm_model"] = self._llm_model
|
||||||
|
|
||||||
for converter in self._page_converters:
|
for converter in self._page_converters:
|
||||||
_kwargs = copy.deepcopy(kwargs)
|
|
||||||
|
|
||||||
# Overwrite file_extension appropriately
|
|
||||||
if ext is None:
|
|
||||||
if "file_extension" in _kwargs:
|
|
||||||
del _kwargs["file_extension"]
|
|
||||||
else:
|
|
||||||
_kwargs.update({"file_extension": ext})
|
|
||||||
|
|
||||||
# Copy any additional global options
|
|
||||||
if "llm_client" not in _kwargs and self._llm_client is not None:
|
|
||||||
_kwargs["llm_client"] = self._llm_client
|
|
||||||
|
|
||||||
if "llm_model" not in _kwargs and self._llm_model is not None:
|
|
||||||
_kwargs["llm_model"] = self._llm_model
|
|
||||||
|
|
||||||
if "style_map" not in _kwargs and self._style_map is not None:
|
if "style_map" not in _kwargs and self._style_map is not None:
|
||||||
_kwargs["style_map"] = self._style_map
|
_kwargs["style_map"] = self._style_map
|
||||||
|
|
|
||||||
BIN
tests/test_files/test.docx
vendored
BIN
tests/test_files/test.docx
vendored
Binary file not shown.
BIN
tests/test_files/test.pptx
vendored
BIN
tests/test_files/test.pptx
vendored
Binary file not shown.
|
|
@ -1,8 +1,9 @@
|
||||||
#!/usr/bin/env python3 -m pytest
|
#!/usr/bin/env python3 -m pytest
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
import shutil
|
import shutil
|
||||||
|
from openai import OpenAI, AzureOpenAI
|
||||||
import pytest
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
@ -134,6 +135,7 @@ SERP_TEST_EXCLUDES = [
|
||||||
"data:image/svg+xml,%3Csvg%20width%3D",
|
"data:image/svg+xml,%3Csvg%20width%3D",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
CSV_CP932_TEST_STRINGS = [
|
CSV_CP932_TEST_STRINGS = [
|
||||||
"名前,年齢,住所",
|
"名前,年齢,住所",
|
||||||
"佐藤太郎,30,東京",
|
"佐藤太郎,30,東京",
|
||||||
|
|
@ -189,8 +191,20 @@ def test_markitdown_remote() -> None:
|
||||||
# assert test_string in result.text_content
|
# assert test_string in result.text_content
|
||||||
|
|
||||||
|
|
||||||
def test_markitdown_local() -> None:
|
def test_markitdown_local(use_mlm = False) -> None:
|
||||||
markitdown = MarkItDown()
|
if (use_mlm):
|
||||||
|
load_dotenv()
|
||||||
|
client = AzureOpenAI(
|
||||||
|
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
|
||||||
|
api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
|
||||||
|
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
|
||||||
|
)
|
||||||
|
llm_model="gpt-4oModel"
|
||||||
|
|
||||||
|
markitdown = MarkItDown(llm_client=client, llm_model=llm_model)
|
||||||
|
else:
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
|
|
||||||
# Test XLSX processing
|
# Test XLSX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
||||||
|
|
@ -305,7 +319,6 @@ def test_markitdown_exiftool() -> None:
|
||||||
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
|
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
|
||||||
assert target in result.text_content
|
assert target in result.text_content
|
||||||
|
|
||||||
|
|
||||||
def test_markitdown_deprecation() -> None:
|
def test_markitdown_deprecation() -> None:
|
||||||
try:
|
try:
|
||||||
with catch_warnings(record=True) as w:
|
with catch_warnings(record=True) as w:
|
||||||
|
|
@ -361,6 +374,8 @@ def test_markitdown_llm() -> None:
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
|
test_markitdown_remote()
|
||||||
|
test_markitdown_local(True)
|
||||||
# test_markitdown_remote()
|
# test_markitdown_remote()
|
||||||
# test_markitdown_local()
|
# test_markitdown_local()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue