improve pdf converter to use html

2025-04-27 02:58:08 +00:00 · 2025-04-27 02:58:08 +00:00 · de2b8d7661
commit de2b8d7661
parent f9510656e0
3 changed files with 14 additions and 27 deletions
--- a/packages/markitup/src/markitup/converters/_markdownify.py
+++ b/packages/markitup/src/markitup/converters/_markdownify.py
@ -102,6 +102,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if "image" in self.config.modalities:
            # Handle data URIs - remove any literal \n characters
            if src.startswith("data:"):
                # Replace literal \n with empty string
                src = src.replace('\\n', '')
                # Also remove actual newlines and whitespace
                src = src.replace('\n', '').replace('\r', '').strip()
            return "![%s](%s%s)" % (alt, src, title_part)
        else:
            return alt
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@ -4,7 +4,7 @@ import base64
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._schemas import StreamInfo, Config
-
+from ._html_converter import HtmlConverter
 import fitz
@ -15,6 +15,7 @@ class PdfConverter(DocumentConverter):
    def __init__(self, config: Config):
        self.config = config
        self._html_converter = HtmlConverter(config=config)
    def convert(
        self,
@ -27,38 +28,17 @@ class PdfConverter(DocumentConverter):
        # Extract text and images from all pages
        markdown_content = ""
-        image_count = 0
+
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            # Get text with the default "text" mode which gives plain text
-            page_text = page.get_text("text")
+            page_text = page.get_text("html")
            # Add page marker
            markdown_content += f"\n\n## Page {page_num + 1}\n\n"
-            markdown_content += page_text + "\n\n"
+            html_conterted_md = self._html_converter.convert_string(page_text)
-
+            markdown_content += html_conterted_md.markdown
-            # Extract images from the page
+            markdown_content += "\n\n"
            image_list = page.get_images(full=True)
            if 'image' in self.config.modalities:
                for img_index, img_info in enumerate(image_list):
                    xref = img_info[0]  # Get the image reference
                    base_image = doc.extract_image(xref)
                    if base_image:
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]
                        try:
                            # Convert image to base64 for markdown embedding
                            img_base64 = base64.b64encode(
                                image_bytes).decode('utf-8')
                            # Add image to markdown with a unique identifier
                            image_count += 1
                            markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
                        except Exception as e:
                            markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
            else:
                markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
        # Close the document to free resources
        doc.close()
        return DocumentConverterResult(
--- a/packages/markitup/tests/test_files/deep.pdf
+++ b/packages/markitup/tests/test_files/deep.pdf