improve pdf converter to use html

This commit is contained in:
rong-xyz 2025-04-27 02:58:08 +00:00
parent f9510656e0
commit de2b8d7661
3 changed files with 14 additions and 27 deletions

View file

@ -102,6 +102,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
if "image" in self.config.modalities:
# Handle data URIs - remove any literal \n characters
if src.startswith("data:"):
# Replace literal \n with empty string
src = src.replace('\\n', '')
# Also remove actual newlines and whitespace
src = src.replace('\n', '').replace('\r', '').strip()
return "![%s](%s%s)" % (alt, src, title_part)
else:
return alt

View file

@ -4,7 +4,7 @@ import base64
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._schemas import StreamInfo, Config
from ._html_converter import HtmlConverter
import fitz
@ -15,6 +15,7 @@ class PdfConverter(DocumentConverter):
def __init__(self, config: Config):
self.config = config
self._html_converter = HtmlConverter(config=config)
def convert(
self,
@ -27,38 +28,17 @@ class PdfConverter(DocumentConverter):
# Extract text and images from all pages
markdown_content = ""
image_count = 0
for page_num in range(len(doc)):
page = doc.load_page(page_num)
# Get text with the default "text" mode which gives plain text
page_text = page.get_text("text")
page_text = page.get_text("html")
# Add page marker
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
markdown_content += page_text + "\n\n"
# Extract images from the page
image_list = page.get_images(full=True)
if 'image' in self.config.modalities:
for img_index, img_info in enumerate(image_list):
xref = img_info[0] # Get the image reference
base_image = doc.extract_image(xref)
if base_image:
image_bytes = base_image["image"]
image_ext = base_image["ext"]
try:
# Convert image to base64 for markdown embedding
img_base64 = base64.b64encode(
image_bytes).decode('utf-8')
# Add image to markdown with a unique identifier
image_count += 1
markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
except Exception as e:
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
else:
markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
html_conterted_md = self._html_converter.convert_string(page_text)
markdown_content += html_conterted_md.markdown
markdown_content += "\n\n"
# Close the document to free resources
doc.close()
return DocumentConverterResult(

Binary file not shown.