supports images

2025-04-21 12:41:33 +00:00 · 2025-04-21 12:41:33 +00:00 · cda189b8d0
commit cda189b8d0
parent cc2ec44a4b
1 changed files with 44 additions and 4 deletions
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@ -1,14 +1,16 @@
 from typing import BinaryIO, Any
+import io
+import base64

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo

-import pdfminer.high_level
+import fitz


 class PdfConverter(DocumentConverter):
    """
-    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    Converts PDFs to Markdown: per-page plain text plus images embedded as base64 data URIs.
    """

    def convert(
@ -17,6 +19,44 @@ class PdfConverter(DocumentConverter):
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
+        # Create a document object from the stream
+        doc = fitz.open(stream=file_stream, filetype="pdf")
+        
+        # Extract text and images from all pages
+        markdown_content = ""
+        image_count = 0
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            
+            # Extract the page's plain text ("text" is the default get_text mode)
+            page_text = page.get_text("text")
+            # Add page marker
+            markdown_content += f"\n\n## Page {page_num + 1}\n\n"
+            markdown_content += page_text + "\n\n"
+            
+            # Extract images from the page
+            image_list = page.get_images(full=True)
+            
+            for img_index, img_info in enumerate(image_list):
+                xref = img_info[0]  # Get the image reference
+                base_image = doc.extract_image(xref)
+                
+                if base_image:
+                    image_bytes = base_image["image"]
+                    image_ext = base_image["ext"]
+                    
+                    try:
+                        # Convert image to base64 for markdown embedding
+                        img_base64 = base64.b64encode(image_bytes).decode('utf-8')
+                        # Embed the image as a data URI, labelled with a running image number
+                        image_count += 1
+                        markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
+                    except Exception as e:
+                        markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
+        
+        # Close the document to free resources
+        doc.close()
+        print(markdown_content)
        return DocumentConverterResult(
-            markdown=pdfminer.high_level.extract_text(file_stream),
+            markdown=markdown_content,
        )