diff --git a/packages/markitup/src/markitup/converters/_markdownify.py b/packages/markitup/src/markitup/converters/_markdownify.py index 679d9fd..1af03be 100644 --- a/packages/markitup/src/markitup/converters/_markdownify.py +++ b/packages/markitup/src/markitup/converters/_markdownify.py @@ -102,6 +102,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): title_part = ' "%s"' % title.replace('"', r"\"") if title else "" if "image" in self.config.modalities: + # Handle data URIs - remove any literal \n characters + if src.startswith("data:"): + # Replace literal \n with empty string + src = src.replace('\\n', '') + # Also remove actual newlines and whitespace + src = src.replace('\n', '').replace('\r', '').strip() + return "![%s](%s%s)" % (alt, src, title_part) else: return alt diff --git a/packages/markitup/src/markitup/converters/_pdf_converter.py b/packages/markitup/src/markitup/converters/_pdf_converter.py index dc1f817..a825801 100644 --- a/packages/markitup/src/markitup/converters/_pdf_converter.py +++ b/packages/markitup/src/markitup/converters/_pdf_converter.py @@ -4,7 +4,7 @@ import base64 from .._base_converter import DocumentConverter, DocumentConverterResult from .._schemas import StreamInfo, Config - +from ._html_converter import HtmlConverter import fitz @@ -15,6 +15,7 @@ class PdfConverter(DocumentConverter): def __init__(self, config: Config): self.config = config + self._html_converter = HtmlConverter(config=config) def convert( self, @@ -27,38 +28,17 @@ class PdfConverter(DocumentConverter): # Extract text and images from all pages markdown_content = "" - image_count = 0 + for page_num in range(len(doc)): page = doc.load_page(page_num) # Get text with the default "text" mode which gives plain text - page_text = page.get_text("text") + page_text = page.get_text("html") # Add page marker markdown_content += f"\n\n## Page {page_num + 1}\n\n" - markdown_content += page_text + "\n\n" - - # Extract images from the page - image_list = page.get_images(full=True) - if 'image' in self.config.modalities: - for img_index, img_info in enumerate(image_list): - xref = img_info[0] # Get the image reference - base_image = doc.extract_image(xref) - - if base_image: - image_bytes = base_image["image"] - image_ext = base_image["ext"] - - try: - # Convert image to base64 for markdown embedding - img_base64 = base64.b64encode( - image_bytes).decode('utf-8') - # Add image to markdown with a unique identifier - image_count += 1 - markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" - except Exception as e: - markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n" - else: - markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n" + html_conterted_md = self._html_converter.convert_string(page_text) + markdown_content += html_conterted_md.markdown + markdown_content += "\n\n" # Close the document to free resources doc.close() return DocumentConverterResult( diff --git a/packages/markitup/tests/test_files/deep.pdf b/packages/markitup/tests/test_files/deep.pdf new file mode 100644 index 0000000..b253eb2 Binary files /dev/null and b/packages/markitup/tests/test_files/deep.pdf differ