NOTE(review): whitespace in this patch was reconstructed from a collapsed copy;
confirm the indentation of the context lines against the target file before
applying. Blob hashes (index lines) were dropped because the content changed.

diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -4,6 +4,8 @@ import os
 import io
 import re
 import html
+import logging
+import uuid
 
 from typing import BinaryIO, Any
 from operator import attrgetter
@@ -139,17 +141,58 @@ class PptxConverter(DocumentConverter):
                     alt_text = "\n".join([llm_description, alt_text]) or shape.name
                     alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                     alt_text = re.sub(r"\s+", " ", alt_text).strip()
 
-                    # If keep_data_uris is True, use base64 encoding for images
-                    if kwargs.get("keep_data_uris", False):
-                        blob = shape.image.blob
-                        content_type = shape.image.content_type or "image/png"
-                        b64_string = base64.b64encode(blob).decode("utf-8")
-                        md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
-                    else:
-                        # A placeholder name
-                        filename = re.sub(r"\W", "", shape.name) + ".jpg"
-                        md_content += "\n![" + alt_text + "](" + filename + ")\n"
+                    # Resolve the link target in priority order: a URL returned by the
+                    # upload handler, then an inline data URI, then a placeholder name.
+                    uploader = kwargs.get("upload_handler")
+                    keep_data_uris = kwargs.get("keep_data_uris", False)
+                    image_target = None
+                    blob = None
+                    content_type = "image/png"
+
+                    # Only read the image payload when something will consume it
+                    if uploader is not None or keep_data_uris:
+                        try:
+                            blob = shape.image.blob
+                            content_type = shape.image.content_type or content_type
+                        except Exception as exc:
+                            # An unreadable image degrades to the placeholder link
+                            logging.getLogger(__name__).warning(
+                                "Error processing image: %s", exc
+                            )
+
+                    if uploader is not None and blob is not None:
+                        try:
+                            # Random name so uploads from different shapes never collide
+                            image_ext = os.path.splitext(shape.image.filename or "")[1]
+                            meta = {
+                                "filename": uuid.uuid4().hex + (image_ext or ".jpg"),
+                                "content_type": content_type,
+                            }
+                            image_url = uploader(blob, meta)
+                            # Only a non-blank string is usable as a link target
+                            if isinstance(image_url, str) and image_url.strip():
+                                image_target = image_url
+                            else:
+                                logging.getLogger(__name__).warning(
+                                    "Upload handler returned invalid URL for %s",
+                                    meta["filename"],
+                                )
+                        except Exception as exc:
+                            # A failing handler must not abort the whole conversion
+                            logging.getLogger(__name__).warning(
+                                "Error processing image: %s", exc
+                            )
+
+                    if image_target is None and keep_data_uris and blob is not None:
+                        b64_string = base64.b64encode(blob).decode("utf-8")
+                        image_target = f"data:{content_type};base64,{b64_string}"
+
+                    if image_target is None:
+                        # A placeholder name
+                        image_target = re.sub(r"\W", "", shape.name) + ".jpg"
+
+                    md_content += "\n![" + alt_text + "](" + image_target + ")\n"
 
                 # Tables
                 if self._is_table(shape):
diff --git a/packages/markitdown/tests/test_upload_handler.py b/packages/markitdown/tests/test_upload_handler.py
new file mode 100644
--- /dev/null
+++ b/packages/markitdown/tests/test_upload_handler.py
@@ -0,0 +1,141 @@
+import os
+import re
+import threading
+from concurrent.futures import ThreadPoolExecutor
+
+import pytest
+
+from markitdown import MarkItDown, StreamInfo
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+PPTX_MIMETYPE = (
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+)
+
+# Leading bytes of the image formats the upload handler is expected to receive
+IMAGE_SIGNATURES = (
+    b"\xff\xd8\xff",  # JPEG
+    b"\x89PNG\r\n\x1a\n",  # PNG
+    b"GIF87a",  # GIF
+    b"GIF89a",  # GIF
+)
+
+
+def _convert_pptx(upload_handler, filename="test.pptx", **kwargs):
+    """Convert a PPTX test file with the given upload handler."""
+    with open(os.path.join(TEST_FILES_DIR, filename), "rb") as stream:
+        return MarkItDown().convert(stream, upload_handler=upload_handler, **kwargs)
+
+
+@pytest.mark.parametrize("filename", ["test.pptx"])
+def test_convert_with_upload_handler_url_format(filename):
+    """Tests that URLs returned by the upload_handler appear in the markdown."""
+    seen_filenames = []
+
+    def recording_upload_handler(image_blob, meta):
+        seen_filenames.append(meta.get("filename"))
+        return f"http://test.com/{meta['filename']}"
+
+    result = _convert_pptx(
+        recording_upload_handler,
+        filename,
+        stream_info=StreamInfo(extension=".pptx", mimetype=PPTX_MIMETYPE),
+    )
+
+    # Checked here, not in the handler: the converter swallows handler exceptions
+    assert seen_filenames
+    for name in seen_filenames:
+        assert name is not None
+        assert re.match(r"[a-f0-9]{32}\.[a-zA-Z]+", name)  # UUID-style filename
+
+    # Verify that the URL is included in the returned markdown
+    assert "http://test.com/" in result.markdown
+    # Verify that the image markdown format is correct
+    assert re.search(
+        r"!\[.*\]\(http://test\.com/[a-f0-9]{32}\.[a-zA-Z]+\)", result.markdown
+    )
+
+
+def test_metadata_completeness():
+    """Verifies that all required fields are included in the metadata."""
+    metadata_fields = set()
+
+    def metadata_collector(image_blob, meta):
+        metadata_fields.update(meta.keys())
+        return "http://test.com"
+
+    _convert_pptx(metadata_collector)
+
+    assert "filename" in metadata_fields
+    assert "content_type" in metadata_fields
+
+
+def test_image_content_verification():
+    """Verifies that the upload handler receives real image data."""
+    blob_prefixes = []
+
+    def blob_recording_handler(image_blob, meta):
+        blob_prefixes.append(bytes(image_blob[:8]))
+        return "http://test.com/verified.jpg"
+
+    result = _convert_pptx(blob_recording_handler)
+
+    # Checked here, not in the handler: the converter swallows handler exceptions
+    assert blob_prefixes
+    for prefix in blob_prefixes:
+        assert prefix.startswith(IMAGE_SIGNATURES), f"Invalid image signature: {prefix!r}"
+    assert "http://test.com/verified.jpg" in result.markdown
+
+
+def test_concurrent_document_conversion():
+    """Verifies that the upload_handler works when documents convert concurrently."""
+
+    def convert_document(_):
+        thread_id = threading.get_ident()
+
+        def thread_specific_handler(image_blob, meta):
+            return f"http://test.com/thread{thread_id}/{meta['filename']}"
+
+        return thread_id, _convert_pptx(thread_specific_handler).markdown
+
+    # Convert the same file in multiple threads simultaneously; map() re-raises
+    # any worker exception here so a failure keeps its traceback
+    with ThreadPoolExecutor(max_workers=3) as executor:
+        results = list(executor.map(convert_document, range(3)))
+
+    assert len(results) == 3, "Expected 3 results, but got a different number"
+    # Verify that each result contains the URL of the thread that produced it
+    for thread_id, markdown in results:
+        assert f"http://test.com/thread{thread_id}/" in markdown
+
+
+@pytest.mark.parametrize("filename", ["test.pptx"])
+def test_upload_handler_exception_fallback(filename):
+    """Tests that conversion falls back to a placeholder when the handler raises."""
+
+    def failing_upload_handler(image_blob, meta):
+        raise RuntimeError("Intentional test exception")
+
+    # Conversion should continue even if an exception occurs
+    result = _convert_pptx(failing_upload_handler, filename)
+
+    # Markdown should be generated
+    assert result.markdown
+    # Default handling (filename) should be applied due to exception handling
+    assert ".jpg" in result.markdown
+
+
+@pytest.mark.parametrize("filename", ["test.pptx"])
+@pytest.mark.parametrize("invalid_value", [None, "", " ", 123, [], {}, False])
+def test_upload_handler_invalid_return_values(filename, invalid_value):
+    """Tests that invalid upload handler return values trigger the fallback."""
+
+    def invalid_return_handler(image_blob, meta):
+        return invalid_value
+
+    result = _convert_pptx(invalid_return_handler, filename)
+
+    # Markdown should be generated
+    assert result.markdown
+    # Fallback handling should be applied for invalid return values
+    assert ".jpg" in result.markdown