update: Add engine_kwargs for customizing engine parameters. Update how PdfConverter calls its engines to make it easier to add more engines. Add examples of using engine_kwargs to extract PDF images.

This commit is contained in:
tungsten106 2024-12-26 14:31:41 +08:00
parent e80854859e
commit 565ef052c1
2 changed files with 43 additions and 15 deletions

View file

@ -14,7 +14,7 @@ import tempfile
import traceback
import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union, Literal
from typing import Any, Dict, List, Optional, Union, Literal, Mapping
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
@ -679,24 +679,35 @@ class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""
_engines: Mapping[str, Any] = {
"pdfminer": pdfminer.high_level.extract_text,
"pymupdf4llm": pymupdf4llm.to_markdown,
}
def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self,
local_path,
engine: Literal["pdfminer", "pymupdf4llm"] = "pdfminer",
engine_kwargs={},
**kwargs,
) -> Union[None, DocumentConverterResult]:
"""
Example:
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
>>> markitdown.convert(source, pdf_engine="pymupdf4llm")
>>> markitdown.convert(source, engine="pymupdf4llm")
"""
# Bail if not a PDF
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf":
return None
if pdf_engine == "pdfminer":
text_content = pdfminer.high_level.extract_text(local_path)
elif pdf_engine == "pymupdf4llm":
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
if engine is not None and engine not in self._engines:
raise FileConversionException(
"'pdf_engine' not valid. Please choose between {}.".format(
list(self._engines.keys())
)
)
else:
# return None # unknown method
raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].")
text_content = self._engines[engine](local_path, **engine_kwargs)
return DocumentConverterResult(title=None, text_content=text_content)

View file

@ -7,8 +7,6 @@ import pytest
import requests
from warnings import catch_warnings, resetwarnings
import sys
sys.path.insert(0, "/home/yxl/Projects/markitdown/src")
from markitdown import MarkItDown
skip_remote = (
@ -302,16 +300,35 @@ def test_markitdown_llm() -> None:
def test_markitdown_pdf() -> None:
markitdown = MarkItDown()
# I test by local pdf, using PDF_TEST_URL may also be fine.
# By pymupdf4llm
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pymupdf4llm")
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
engine="pymupdf4llm",
engine_kwargs={"show_progress": False}, # additional kwargs
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content
# By pymupdf4llm and extract images
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
engine="pymupdf4llm",
engine_kwargs={
"show_progress": False,
"write_images": True,
"image_path": "tests/pics",
}, # `write_images` must be True, setting `image_path` for images saving dir.
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content
# By pdfminer
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pdfminer")
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), engine="pdfminer"
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content