supports images
This commit is contained in:
parent
cc2ec44a4b
commit
cda189b8d0
1 changed file with 44 additions and 4 deletions
|
|
@ -1,14 +1,16 @@
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
|
import io
|
||||||
|
import base64
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
|
|
||||||
import pdfminer.high_level
|
import fitz
|
||||||
|
|
||||||
|
|
||||||
class PdfConverter(DocumentConverter):
|
class PdfConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
Converts PDFs to Markdown with embedded images.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
|
|
@ -17,6 +19,44 @@ class PdfConverter(DocumentConverter):
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
|
# Create a document object from the stream
|
||||||
|
doc = fitz.open(stream=file_stream, filetype="pdf")
|
||||||
|
|
||||||
|
# Extract text and images from all pages
|
||||||
|
markdown_content = ""
|
||||||
|
image_count = 0
|
||||||
|
for page_num in range(len(doc)):
|
||||||
|
page = doc.load_page(page_num)
|
||||||
|
|
||||||
|
# Get text with the default "text" mode which gives plain text
|
||||||
|
page_text = page.get_text("text")
|
||||||
|
# Add page marker
|
||||||
|
markdown_content += f"\n\n## Page {page_num + 1}\n\n"
|
||||||
|
markdown_content += page_text + "\n\n"
|
||||||
|
|
||||||
|
# Extract images from the page
|
||||||
|
image_list = page.get_images(full=True)
|
||||||
|
|
||||||
|
for img_index, img_info in enumerate(image_list):
|
||||||
|
xref = img_info[0] # Get the image reference
|
||||||
|
base_image = doc.extract_image(xref)
|
||||||
|
|
||||||
|
if base_image:
|
||||||
|
image_bytes = base_image["image"]
|
||||||
|
image_ext = base_image["ext"]
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Convert image to base64 for markdown embedding
|
||||||
|
img_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
||||||
|
# Add image to markdown with a unique identifier
|
||||||
|
image_count += 1
|
||||||
|
markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
|
||||||
|
except Exception as e:
|
||||||
|
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
||||||
|
|
||||||
|
# Close the document to free resources
|
||||||
|
doc.close()
|
||||||
|
print(markdown_content)
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=pdfminer.high_level.extract_text(file_stream),
|
markdown=markdown_content,
|
||||||
)
|
)
|
||||||
Loading…
Reference in a new issue