Merge 03e852bd5b into 041be54471
This commit is contained in:
commit
2670bb7396
2 changed files with 255 additions and 9 deletions
|
|
@ -4,6 +4,7 @@ import os
|
|||
import io
|
||||
import re
|
||||
import html
|
||||
import uuid
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
from operator import attrgetter
|
||||
|
|
@ -140,16 +141,60 @@ class PptxConverter(DocumentConverter):
|
|||
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
|
||||
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
||||
|
||||
# If keep_data_uris is True, use base64 encoding for images
|
||||
if kwargs.get("keep_data_uris", False):
|
||||
uploader = kwargs.get("upload_handler")
|
||||
keep_data_uris = kwargs.get("keep_data_uris", False)
|
||||
|
||||
try:
|
||||
blob = shape.image.blob
|
||||
content_type = shape.image.content_type or "image/png"
|
||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||
md_content += f"\n\n"
|
||||
else:
|
||||
# A placeholder name
|
||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||
md_content += "\n\n"
|
||||
original_ext = os.path.splitext(shape.image.filename)[1] if shape.image.filename else ".jpg"
|
||||
unique_filename = f"{uuid.uuid4().hex}{original_ext}"
|
||||
|
||||
# Use uploader if available
|
||||
if uploader:
|
||||
meta = {
|
||||
"filename": unique_filename,
|
||||
"content_type": shape.image.content_type or "image/png",
|
||||
}
|
||||
image_url = uploader(blob, meta)
|
||||
|
||||
# Verify if a valid URL was returned
|
||||
if image_url and isinstance(image_url, str) and image_url.strip():
|
||||
md_content += f"\n\n"
|
||||
else:
|
||||
print(f"Warning: Upload handler returned invalid URL for {unique_filename}")
|
||||
# Fallback if uploader fails
|
||||
if keep_data_uris:
|
||||
content_type = shape.image.content_type or "image/png"
|
||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||
md_content += f"\n\n"
|
||||
else:
|
||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||
md_content += "\n\n"
|
||||
# No uploader but data URI retention is enabled
|
||||
elif keep_data_uris:
|
||||
content_type = shape.image.content_type or "image/png"
|
||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||
md_content += f"\n\n"
|
||||
# Default case: use filename only
|
||||
else:
|
||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||
md_content += "\n\n"
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing image: {str(e)}")
|
||||
# Default handling on error
|
||||
if keep_data_uris:
|
||||
try:
|
||||
blob = shape.image.blob
|
||||
content_type = shape.image.content_type or "image/png"
|
||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||
md_content += f"\n\n"
|
||||
except:
|
||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||
md_content += "\n\n"
|
||||
else:
|
||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||
md_content += "\n\n"
|
||||
|
||||
# Tables
|
||||
if self._is_table(shape):
|
||||
|
|
|
|||
201
packages/markitdown/tests/test_upload_handler.py
Normal file
201
packages/markitdown/tests/test_upload_handler.py
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
import os
|
||||
import pytest
|
||||
from markitdown import MarkItDown, StreamInfo
|
||||
import re
|
||||
|
||||
|
||||
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
||||
@pytest.mark.parametrize("filename", ["test.pptx"])
def test_convert_with_upload_handler_url_format(filename):
    """Check that URLs returned by ``upload_handler`` end up as markdown image links.

    The handler only *records* the metadata it is given; all assertions run
    after the conversion. The PPTX converter calls the handler inside a
    ``try``/``except Exception`` block, so an ``AssertionError`` raised from
    within the handler would be swallowed and turned into the filename
    fallback instead of failing this test.
    """
    markitdown = MarkItDown()
    file_path = os.path.join(TEST_FILES_DIR, filename)

    # One entry per handler invocation, copied so later mutation cannot leak in.
    received_meta = []

    def recording_upload_handler(image_blob, meta):
        received_meta.append(dict(meta))
        return f"http://test.com/{meta.get('filename', 'missing')}"

    with open(file_path, "rb") as stream:
        result = markitdown.convert(
            stream,
            stream_info=StreamInfo(
                extension=".pptx",
                mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ),
            upload_handler=recording_upload_handler,
        )

    # The handler must actually have been used for at least one image.
    assert received_meta, "upload_handler was never called"

    # Every generated filename is "<32 hex chars>.<extension>"; fullmatch
    # rejects trailing garbage that a plain re.match would let through.
    for meta in received_meta:
        assert "filename" in meta
        assert re.fullmatch(r"[a-f0-9]{32}\.[a-zA-Z0-9]+", meta["filename"]), meta

    # Verify that the URL is included in the returned markdown
    assert "http://test.com/" in result.markdown
    # Verify that the image markdown format is correct
    assert re.search(
        r"!\[.*\]\(http://test\.com/[a-f0-9]{32}\.[a-zA-Z0-9]+\)", result.markdown
    )
|
||||
|
||||
def test_metadata_completeness():
    """Ensure the metadata handed to the upload handler carries every required key."""
    pptx_path = os.path.join(TEST_FILES_DIR, "test.pptx")

    # Union of all metadata keys seen across every handler invocation.
    seen_keys = set()

    def collect_keys(image_blob, meta):
        # Mutating the set in place needs no ``nonlocal`` declaration.
        seen_keys.update(meta)
        return "http://test.com"

    with open(pptx_path, "rb") as stream:
        MarkItDown().convert(stream, upload_handler=collect_keys)

    for required_key in ("filename", "content_type"):
        assert required_key in seen_keys
|
||||
|
||||
|
||||
|
||||
def test_image_content_verification():
    """Verify that the upload handler receives non-empty, recognisable image data.

    Each blob passed to the handler is recorded and checked after the
    conversion: it must be non-empty and start with a JPEG, PNG or GIF
    signature. The checks are deliberately made outside the handler because
    the PPTX converter wraps the handler call in ``try``/``except Exception``,
    which would silently swallow an ``AssertionError`` raised inside it.
    """
    markitdown = MarkItDown()
    file_path = os.path.join(TEST_FILES_DIR, "test.pptx")

    # Leading magic bytes of the formats this test accepts.
    image_signatures = (
        b"\xFF\xD8\xFF",  # JPEG (any flavour: JFIF, Exif, ...)
        b"\x89PNG\r\n\x1a\n",  # PNG (full 8-byte signature)
        b"GIF8",  # GIF87a / GIF89a
    )

    received_blobs = []

    def recording_handler(image_blob, meta):
        received_blobs.append(image_blob)
        return "http://test.com/verified.jpg"

    with open(file_path, "rb") as stream:
        result = markitdown.convert(
            stream,
            upload_handler=recording_handler,
        )

    assert received_blobs, "upload_handler was never called"
    for image_blob in received_blobs:
        # Verify that the image blob contains actual data
        assert len(image_blob) > 0
        # bytes.startswith accepts a tuple, so one call covers all formats.
        assert image_blob.startswith(
            image_signatures
        ), f"Invalid image signature: {image_blob[:8]}"

    assert "http://test.com/verified.jpg" in result.markdown
|
||||
|
||||
def test_concurrent_document_conversion():
    """Verify that ``upload_handler`` works when documents are converted concurrently.

    The same PPTX file is converted in three worker threads, each with its
    own handler that embeds the worker's thread id in the returned URL. Every
    conversion must succeed and its markdown must contain the URL prefix
    produced by its own handler.
    """
    import threading
    from concurrent.futures import ThreadPoolExecutor

    def convert_document(filename):
        """Convert *filename* and return ``(thread_id, markdown)``."""
        markitdown = MarkItDown()
        file_path = os.path.join(TEST_FILES_DIR, filename)
        thread_id = threading.get_ident()

        def thread_specific_handler(image_blob, meta):
            return f"http://test.com/thread{thread_id}/{meta['filename']}"

        with open(file_path, "rb") as stream:
            result = markitdown.convert(
                stream,
                upload_handler=thread_specific_handler,
            )
        return thread_id, result.markdown

    # Convert the same file in multiple threads simultaneously; leaving the
    # ``with`` block waits for all submitted work to finish.
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(convert_document, "test.pptx") for _ in range(3)]

    results = []
    exceptions = []
    for future in futures:
        error = future.exception()
        if error is None:
            results.append(future.result())
        else:
            # repr keeps the exception type, which str(e) alone would drop.
            exceptions.append(("test.pptx", repr(error)))

    # Verify that all conversions were successful
    assert len(exceptions) == 0, f"Exceptions occurred during conversion: {exceptions}"
    assert len(results) == 3, f"Expected 3 results, but got {len(results)}"

    # Verify that each result contains the URL generated by its own handler
    for thread_id, markdown in results:
        assert f"http://test.com/thread{thread_id}/" in markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", ["test.pptx"])
def test_upload_handler_exception_fallback(filename):
    """Conversion must survive an upload handler that raises.

    When the handler throws, the converter is expected to keep going and
    emit its default filename-based image reference instead.
    """

    def failing_upload_handler(image_blob, meta):
        # Always fail so the converter's fallback path is exercised.
        raise Exception("Intentional test exception")

    converter = MarkItDown()
    pptx_path = os.path.join(TEST_FILES_DIR, filename)

    # The conversion itself must not propagate the handler's exception.
    with open(pptx_path, "rb") as stream:
        result = converter.convert(stream, upload_handler=failing_upload_handler)

    # Some markdown has to come back ...
    assert result.markdown is not None
    assert len(result.markdown) > 0

    # ... and it must contain the default ".jpg" placeholder reference.
    assert ".jpg" in result.markdown
|
||||
|
||||
@pytest.mark.parametrize("filename", ["test.pptx"])
def test_upload_handler_invalid_return_values(filename):
    """Check that unusable return values from the upload handler trigger the fallback.

    For each invalid value (``None``, blank strings, non-string objects) the
    conversion must still produce markdown, fall back to the default ``.jpg``
    filename reference, and never use the invalid value as an image URL.
    """
    markitdown = MarkItDown()
    file_path = os.path.join(TEST_FILES_DIR, filename)

    invalid_return_values = [
        None,  # Return None
        "",  # Empty string
        "   ",  # String with only whitespace
        123,  # Number (not a string)
        [],  # Empty list
        {},  # Empty dictionary
        False,  # Boolean value
    ]

    for invalid_value in invalid_return_values:
        # Bind the current value as a default so the handler does not depend
        # on late binding of the loop variable.
        def invalid_return_handler(image_blob, meta, _value=invalid_value):
            return _value

        with open(file_path, "rb") as stream:
            result = markitdown.convert(
                stream,
                upload_handler=invalid_return_handler,
            )

        # Markdown should be generated
        assert result.markdown is not None, repr(invalid_value)
        assert len(result.markdown) > 0, repr(invalid_value)

        # Fallback handling should be applied for invalid return values
        assert ".jpg" in result.markdown, repr(invalid_value)

        # A non-string value must never be formatted into an image link.
        # Blank strings are skipped: "]()" could legitimately occur in the
        # document's own markdown.
        if not isinstance(invalid_value, str):
            assert f"]({invalid_value})" not in result.markdown, repr(invalid_value)
|
||||
Loading…
Reference in a new issue