improve pdf converter to use html
This commit is contained in:
parent
f9510656e0
commit
de2b8d7661
3 changed files with 14 additions and 27 deletions
|
|
@ -102,6 +102,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||
|
||||
if "image" in self.config.modalities:
|
||||
# Handle data URIs - remove any literal \n characters
|
||||
if src.startswith("data:"):
|
||||
# Replace literal \n with empty string
|
||||
src = src.replace('\\n', '')
|
||||
# Also remove actual newlines and whitespace
|
||||
src = src.replace('\n', '').replace('\r', '').strip()
|
||||
|
||||
return "" % (alt, src, title_part)
|
||||
else:
|
||||
return alt
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import base64
|
|||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._schemas import StreamInfo, Config
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
import fitz
|
||||
|
||||
|
||||
|
|
@ -15,6 +15,7 @@ class PdfConverter(DocumentConverter):
|
|||
|
||||
def __init__(self, config: Config):
|
||||
self.config = config
|
||||
self._html_converter = HtmlConverter(config=config)
|
||||
|
||||
def convert(
|
||||
self,
|
||||
|
|
@ -27,38 +28,17 @@ class PdfConverter(DocumentConverter):
|
|||
|
||||
# Extract text and images from all pages
|
||||
markdown_content = ""
|
||||
image_count = 0
|
||||
|
||||
for page_num in range(len(doc)):
|
||||
page = doc.load_page(page_num)
|
||||
|
||||
# Get text with the default "text" mode which gives plain text
|
||||
page_text = page.get_text("text")
|
||||
page_text = page.get_text("html")
|
||||
# Add page marker
|
||||
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
|
||||
markdown_content += page_text + "\n\n"
|
||||
|
||||
# Extract images from the page
|
||||
image_list = page.get_images(full=True)
|
||||
if 'image' in self.config.modalities:
|
||||
for img_index, img_info in enumerate(image_list):
|
||||
xref = img_info[0] # Get the image reference
|
||||
base_image = doc.extract_image(xref)
|
||||
|
||||
if base_image:
|
||||
image_bytes = base_image["image"]
|
||||
image_ext = base_image["ext"]
|
||||
|
||||
try:
|
||||
# Convert image to base64 for markdown embedding
|
||||
img_base64 = base64.b64encode(
|
||||
image_bytes).decode('utf-8')
|
||||
# Add image to markdown with a unique identifier
|
||||
image_count += 1
|
||||
markdown_content += f"\n\n"
|
||||
except Exception as e:
|
||||
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
||||
else:
|
||||
markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
|
||||
html_conterted_md = self._html_converter.convert_string(page_text)
|
||||
markdown_content += html_conterted_md.markdown
|
||||
markdown_content += "\n\n"
|
||||
# Close the document to free resources
|
||||
doc.close()
|
||||
return DocumentConverterResult(
|
||||
|
|
|
|||
BIN
packages/markitup/tests/test_files/deep.pdf
Normal file
BIN
packages/markitup/tests/test_files/deep.pdf
Normal file
Binary file not shown.
Loading…
Reference in a new issue