Improve PDF converter to extract pages as HTML and convert them to Markdown
This commit is contained in:
parent
f9510656e0
commit
de2b8d7661
3 changed files with 14 additions and 27 deletions
|
|
@ -102,6 +102,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||||
|
|
||||||
if "image" in self.config.modalities:
|
if "image" in self.config.modalities:
|
||||||
|
# Handle data URIs - remove any literal \n characters
|
||||||
|
if src.startswith("data:"):
|
||||||
|
# Replace literal \n with empty string
|
||||||
|
src = src.replace('\\n', '')
|
||||||
|
# Also remove actual newlines and whitespace
|
||||||
|
src = src.replace('\n', '').replace('\r', '').strip()
|
||||||
|
|
||||||
return "" % (alt, src, title_part)
|
return "" % (alt, src, title_part)
|
||||||
else:
|
else:
|
||||||
return alt
|
return alt
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ import base64
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._schemas import StreamInfo, Config
|
from .._schemas import StreamInfo, Config
|
||||||
|
from ._html_converter import HtmlConverter
|
||||||
import fitz
|
import fitz
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -15,6 +15,7 @@ class PdfConverter(DocumentConverter):
|
||||||
|
|
||||||
def __init__(self, config: Config):
|
def __init__(self, config: Config):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self._html_converter = HtmlConverter(config=config)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
|
|
@ -27,38 +28,17 @@ class PdfConverter(DocumentConverter):
|
||||||
|
|
||||||
# Extract text and images from all pages
|
# Extract text and images from all pages
|
||||||
markdown_content = ""
|
markdown_content = ""
|
||||||
image_count = 0
|
|
||||||
for page_num in range(len(doc)):
|
for page_num in range(len(doc)):
|
||||||
page = doc.load_page(page_num)
|
page = doc.load_page(page_num)
|
||||||
|
|
||||||
# Get text with the default "text" mode which gives plain text
|
# Get the page content in "html" mode so it can be passed to the HTML converter
|
||||||
page_text = page.get_text("text")
|
page_text = page.get_text("html")
|
||||||
# Add page marker
|
# Add page marker
|
||||||
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
|
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
|
||||||
markdown_content += page_text + "\n\n"
|
html_conterted_md = self._html_converter.convert_string(page_text)
|
||||||
|
markdown_content += html_conterted_md.markdown
|
||||||
# Extract images from the page
|
markdown_content += "\n\n"
|
||||||
image_list = page.get_images(full=True)
|
|
||||||
if 'image' in self.config.modalities:
|
|
||||||
for img_index, img_info in enumerate(image_list):
|
|
||||||
xref = img_info[0] # Get the image reference
|
|
||||||
base_image = doc.extract_image(xref)
|
|
||||||
|
|
||||||
if base_image:
|
|
||||||
image_bytes = base_image["image"]
|
|
||||||
image_ext = base_image["ext"]
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Convert image to base64 for markdown embedding
|
|
||||||
img_base64 = base64.b64encode(
|
|
||||||
image_bytes).decode('utf-8')
|
|
||||||
# Add image to markdown with a unique identifier
|
|
||||||
image_count += 1
|
|
||||||
markdown_content += f"\n\n"
|
|
||||||
except Exception as e:
|
|
||||||
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
|
||||||
else:
|
|
||||||
markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
|
|
||||||
# Close the document to free resources
|
# Close the document to free resources
|
||||||
doc.close()
|
doc.close()
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
|
|
||||||
BIN
packages/markitup/tests/test_files/deep.pdf
Normal file
BIN
packages/markitup/tests/test_files/deep.pdf
Normal file
Binary file not shown.
Loading…
Reference in a new issue