feat:Adding image upload handler functionality to PPTX converter
* Add handler to upload images extracted from PPTX to external storage * Generate unique filenames based on UUID to prevent image filename collisions * Implement fallback handling logic for upload failures * Add unit tests
This commit is contained in:
parent
041be54471
commit
03e852bd5b
2 changed files with 255 additions and 9 deletions
|
|
@ -4,6 +4,7 @@ import os
|
||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
import html
|
import html
|
||||||
|
import uuid
|
||||||
|
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
from operator import attrgetter
|
from operator import attrgetter
|
||||||
|
|
@ -140,16 +141,60 @@ class PptxConverter(DocumentConverter):
|
||||||
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
|
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
|
||||||
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
||||||
|
|
||||||
# If keep_data_uris is True, use base64 encoding for images
|
uploader = kwargs.get("upload_handler")
|
||||||
if kwargs.get("keep_data_uris", False):
|
keep_data_uris = kwargs.get("keep_data_uris", False)
|
||||||
|
|
||||||
|
try:
|
||||||
blob = shape.image.blob
|
blob = shape.image.blob
|
||||||
content_type = shape.image.content_type or "image/png"
|
original_ext = os.path.splitext(shape.image.filename)[1] if shape.image.filename else ".jpg"
|
||||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
unique_filename = f"{uuid.uuid4().hex}{original_ext}"
|
||||||
md_content += f"\n\n"
|
|
||||||
else:
|
# Use uploader if available
|
||||||
# A placeholder name
|
if uploader:
|
||||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
meta = {
|
||||||
md_content += "\n\n"
|
"filename": unique_filename,
|
||||||
|
"content_type": shape.image.content_type or "image/png",
|
||||||
|
}
|
||||||
|
image_url = uploader(blob, meta)
|
||||||
|
|
||||||
|
# Verify if a valid URL was returned
|
||||||
|
if image_url and isinstance(image_url, str) and image_url.strip():
|
||||||
|
md_content += f"\n\n"
|
||||||
|
else:
|
||||||
|
print(f"Warning: Upload handler returned invalid URL for {unique_filename}")
|
||||||
|
# Fallback if uploader fails
|
||||||
|
if keep_data_uris:
|
||||||
|
content_type = shape.image.content_type or "image/png"
|
||||||
|
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||||
|
md_content += f"\n\n"
|
||||||
|
else:
|
||||||
|
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||||
|
md_content += "\n\n"
|
||||||
|
# No uploader but data URI retention is enabled
|
||||||
|
elif keep_data_uris:
|
||||||
|
content_type = shape.image.content_type or "image/png"
|
||||||
|
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||||
|
md_content += f"\n\n"
|
||||||
|
# Default case: use filename only
|
||||||
|
else:
|
||||||
|
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||||
|
md_content += "\n\n"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error processing image: {str(e)}")
|
||||||
|
# Default handling on error
|
||||||
|
if keep_data_uris:
|
||||||
|
try:
|
||||||
|
blob = shape.image.blob
|
||||||
|
content_type = shape.image.content_type or "image/png"
|
||||||
|
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||||
|
md_content += f"\n\n"
|
||||||
|
except:
|
||||||
|
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||||
|
md_content += "\n\n"
|
||||||
|
else:
|
||||||
|
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||||
|
md_content += "\n\n"
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
if self._is_table(shape):
|
if self._is_table(shape):
|
||||||
|
|
|
||||||
201
packages/markitdown/tests/test_upload_handler.py
Normal file
201
packages/markitdown/tests/test_upload_handler.py
Normal file
|
|
@ -0,0 +1,201 @@
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from markitdown import MarkItDown, StreamInfo
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("filename", ["test.pptx"])
|
||||||
|
def test_convert_with_upload_handler_url_format(filename):
|
||||||
|
"""Tests if the URL returned by the upload_handler is included in the converted markdown in the correct format when converting a PPTX file."""
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
|
file_path = os.path.join(TEST_FILES_DIR, filename)
|
||||||
|
|
||||||
|
|
||||||
|
def validation_upload_handler(image_blob, meta):
|
||||||
|
assert "filename" in meta
|
||||||
|
assert re.match(r"[a-f0-9]{32}\.[a-zA-Z]+", meta["filename"]) # Check if filename is in UUID format
|
||||||
|
return f"http://test.com/{meta['filename']}"
|
||||||
|
|
||||||
|
with open(file_path, "rb") as stream:
|
||||||
|
result = markitdown.convert(
|
||||||
|
stream,
|
||||||
|
stream_info=StreamInfo(
|
||||||
|
extension=".pptx",
|
||||||
|
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||||
|
),
|
||||||
|
upload_handler=validation_upload_handler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify that the URL is included in the returned markdown
|
||||||
|
assert "http://test.com/" in result.markdown
|
||||||
|
# Verify that the image markdown format is correct
|
||||||
|
assert re.search(r"!\[.*\]\(http://test\.com/[a-f0-9]{32}\.[a-zA-Z]+\)", result.markdown)
|
||||||
|
|
||||||
|
def test_metadata_completeness():
|
||||||
|
"""Verifies that all required fields are included in the metadata."""
|
||||||
|
metadata_fields = set()
|
||||||
|
|
||||||
|
def metadata_collector(image_blob, meta):
|
||||||
|
nonlocal metadata_fields
|
||||||
|
metadata_fields.update(meta.keys())
|
||||||
|
return "http://test.com"
|
||||||
|
|
||||||
|
with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as stream:
|
||||||
|
result = MarkItDown().convert(
|
||||||
|
stream,
|
||||||
|
upload_handler=metadata_collector
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "filename" in metadata_fields
|
||||||
|
assert "content_type" in metadata_fields
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_image_content_verification():
|
||||||
|
"""
|
||||||
|
Verifies that the image blob passed to the upload handler matches the original.
|
||||||
|
"""
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
file_path = os.path.join(TEST_FILES_DIR, "test.pptx")
|
||||||
|
|
||||||
|
# Upload handler that verifies the size of the image blob
|
||||||
|
def size_verification_handler(image_blob, meta):
|
||||||
|
# Verify that the image blob contains actual data
|
||||||
|
assert len(image_blob) > 0
|
||||||
|
# Verify that the image blob is in a proper image format
|
||||||
|
# Note: PNG signature is 8 bytes, so slice modification
|
||||||
|
jpeg_sig = b'\xFF\xD8\xFF\xE0' # JPEG
|
||||||
|
png_sig = b'\x89PNG' # PNG (first 4 bytes only)
|
||||||
|
gif_sig = b'GIF8' # GIF
|
||||||
|
|
||||||
|
# Check image signature (first 4 bytes only)
|
||||||
|
img_start = image_blob[:4]
|
||||||
|
valid_sig = False
|
||||||
|
if img_start.startswith(jpeg_sig[:2]) or img_start.startswith(png_sig[:2]) or img_start.startswith(gif_sig[:2]):
|
||||||
|
valid_sig = True
|
||||||
|
|
||||||
|
assert valid_sig, f"Invalid image signature: {img_start}"
|
||||||
|
return "http://test.com/verified.jpg"
|
||||||
|
|
||||||
|
with open(file_path, "rb") as stream: result = markitdown.convert(
|
||||||
|
stream,
|
||||||
|
upload_handler=size_verification_handler
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "http://test.com/verified.jpg" in result.markdown
|
||||||
|
|
||||||
|
def test_concurrent_document_conversion():
|
||||||
|
"""
|
||||||
|
Verifies that the upload_handler works correctly when multiple documents are converted simultaneously.
|
||||||
|
"""
|
||||||
|
import threading
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
results = []
|
||||||
|
exceptions = []
|
||||||
|
|
||||||
|
def convert_document(filename):
|
||||||
|
try:
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
file_path = os.path.join(TEST_FILES_DIR, filename)
|
||||||
|
|
||||||
|
thread_id = threading.get_ident()
|
||||||
|
def thread_specific_handler(image_blob, meta):
|
||||||
|
return f"http://test.com/thread{thread_id}/{meta['filename']}"
|
||||||
|
|
||||||
|
with open(file_path, "rb") as stream:
|
||||||
|
result = markitdown.convert(
|
||||||
|
stream,
|
||||||
|
upload_handler=thread_specific_handler
|
||||||
|
)
|
||||||
|
results.append((filename, result.markdown))
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
exceptions.append((filename, str(e)))
|
||||||
|
|
||||||
|
# Convert the same file in multiple threads simultaneously
|
||||||
|
with ThreadPoolExecutor(max_workers=3) as executor:
|
||||||
|
for _ in range(3):
|
||||||
|
executor.submit(convert_document, "test.pptx")
|
||||||
|
|
||||||
|
# Verify that all conversions were successful
|
||||||
|
assert len(exceptions) == 0, f"Exceptions occurred during conversion: {exceptions}"
|
||||||
|
assert len(results) == 3, "Expected 3 results, but got a different number"
|
||||||
|
|
||||||
|
# Verify that each result contains a unique thread-specific URL
|
||||||
|
for filename, markdown in results:
|
||||||
|
assert "http://test.com/thread" in markdown
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("filename", ["test.pptx"])
|
||||||
|
def test_upload_handler_exception_fallback(filename):
|
||||||
|
"""
|
||||||
|
Tests that the conversion process continues and fallback handling is applied
|
||||||
|
when an exception occurs in the upload handler.
|
||||||
|
"""
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
file_path = os.path.join(TEST_FILES_DIR, filename)
|
||||||
|
|
||||||
|
# Upload handler that raises an exception
|
||||||
|
def failing_upload_handler(image_blob, meta):
|
||||||
|
raise Exception("Intentional test exception")
|
||||||
|
|
||||||
|
# Conversion should continue even if an exception occurs
|
||||||
|
with open(file_path, "rb") as stream:
|
||||||
|
result = markitdown.convert(
|
||||||
|
stream,
|
||||||
|
upload_handler=failing_upload_handler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Markdown should be generated
|
||||||
|
assert result.markdown is not None
|
||||||
|
assert len(result.markdown) > 0
|
||||||
|
|
||||||
|
# Default handling (filename) should be applied due to exception handling
|
||||||
|
assert ".jpg" in result.markdown
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("filename", ["test.pptx"])
|
||||||
|
def test_upload_handler_invalid_return_values(filename):
|
||||||
|
"""
|
||||||
|
Tests that various types of invalid return values from the upload handler
|
||||||
|
are handled appropriately.
|
||||||
|
"""
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
file_path = os.path.join(TEST_FILES_DIR, filename)
|
||||||
|
|
||||||
|
invalid_return_values = [
|
||||||
|
None, # Return None
|
||||||
|
"", # Empty string
|
||||||
|
" ", # String with only whitespace
|
||||||
|
123, # Number (not a string)
|
||||||
|
[], # Empty list
|
||||||
|
{}, # Empty dictionary
|
||||||
|
False # Boolean value
|
||||||
|
]
|
||||||
|
|
||||||
|
for invalid_value in invalid_return_values:
|
||||||
|
# Handler that returns an invalid value
|
||||||
|
def invalid_return_handler(image_blob, meta):
|
||||||
|
return invalid_value
|
||||||
|
|
||||||
|
with open(file_path, "rb") as stream:
|
||||||
|
result = markitdown.convert(
|
||||||
|
stream,
|
||||||
|
upload_handler=invalid_return_handler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Markdown should be generated
|
||||||
|
assert result.markdown is not None
|
||||||
|
assert len(result.markdown) > 0
|
||||||
|
|
||||||
|
# Fallback handling should be applied for invalid return values
|
||||||
|
assert ".jpg" in result.markdown
|
||||||
|
|
||||||
|
# Skip empty string validation - markdown text may contain empty strings
|
||||||
|
if isinstance(invalid_value, str) and invalid_value.strip() and len(invalid_value) > 3:
|
||||||
|
assert invalid_value not in result.markdown
|
||||||
Loading…
Reference in a new issue