update: Add engine_kwargs for customizing engine parameters. Update how PdfConverter calls its engines to make adding more engines easier. Add examples of using engine_kwargs to extract PDF images.

This commit is contained in:
tungsten106 2024-12-26 14:31:41 +08:00
parent e80854859e
commit 565ef052c1
2 changed files with 43 additions and 15 deletions

View file

@ -14,7 +14,7 @@ import tempfile
import traceback import traceback
import zipfile import zipfile
from xml.dom import minidom from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union, Literal from typing import Any, Dict, List, Optional, Union, Literal, Mapping
from pathlib import Path from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings from warnings import warn, resetwarnings, catch_warnings
@ -679,24 +679,35 @@ class PdfConverter(DocumentConverter):
""" """
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
""" """
# Registry of the PDF text-extraction backends, keyed by the engine name that
# `convert` accepts. `convert` invokes the selected value as
# `fn(local_path, **engine_kwargs)`, so adding a backend means adding an entry here.
_engines: Mapping[str, Any] = {
"pdfminer": pdfminer.high_level.extract_text,
"pymupdf4llm": pymupdf4llm.to_markdown,
}
def convert(
    self,
    local_path,
    engine: Literal["pdfminer", "pymupdf4llm"] = "pdfminer",
    engine_kwargs: Optional[Mapping[str, Any]] = None,
    **kwargs,
) -> "Union[None, DocumentConverterResult]":
    """
    Convert the PDF at `local_path` to text with the selected engine.

    Args:
        local_path: Path of the PDF file; passed as the first positional
            argument to the engine callable.
        engine: Key into `self._engines` selecting the extraction backend.
            `None` falls back to "pdfminer".
        engine_kwargs: Extra keyword arguments forwarded unchanged to the
            engine callable, e.g. `{"show_progress": False}`. `None` means
            no extra arguments.
        **kwargs: Converter options; only `file_extension` is read here.

    Returns:
        None when `file_extension` is not ".pdf" (case-insensitive),
        otherwise a DocumentConverterResult with `title=None` and the
        engine output as `text_content`.

    Raises:
        FileConversionException: if `engine` is not a key of `self._engines`.

    Example:
        >>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
        >>> markitdown.convert(source, engine="pymupdf4llm")
    """
    # Bail if not a PDF
    extension = kwargs.get("file_extension", "")
    if extension.lower() != ".pdf":
        return None

    # An explicit None used to slip past validation and fail with a bare
    # KeyError on the registry lookup; treat it as "use the default engine".
    if engine is None:
        engine = "pdfminer"

    extract = self._engines.get(engine)
    if extract is None:
        raise FileConversionException(
            "'engine' not valid. Please choose between {}.".format(
                list(self._engines)
            )
        )

    # `engine_kwargs` defaults to None rather than a shared mutable dict.
    text_content = extract(local_path, **(engine_kwargs or {}))
    return DocumentConverterResult(title=None, text_content=text_content)

View file

@ -7,8 +7,6 @@ import pytest
import requests import requests
from warnings import catch_warnings, resetwarnings from warnings import catch_warnings, resetwarnings
import sys
sys.path.insert(0, "/home/yxl/Projects/markitdown/src")
from markitdown import MarkItDown from markitdown import MarkItDown
skip_remote = ( skip_remote = (
@ -306,12 +304,31 @@ def test_markitdown_pdf() -> None:
# I test by local pdf, using PDF_TEST_URL may also be fine. # I test by local pdf, using PDF_TEST_URL may also be fine.
# By pymupdf4llm # By pymupdf4llm
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pymupdf4llm") result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
engine="pymupdf4llm",
engine_kwargs={"show_progress": False}, # additional kwargs
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content
# By pymupdf4llm and extract images
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
engine="pymupdf4llm",
engine_kwargs={
"show_progress": False,
"write_images": True,
"image_path": "tests/pics",
}, # `write_images` must be True, setting `image_path` for images saving dir.
)
for test_string in PDF_TEST_STRINGS: for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content assert test_string in result.text_content
# By pdfminer # By pdfminer
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pdfminer") result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), engine="pdfminer"
)
for test_string in PDF_TEST_STRINGS: for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content assert test_string in result.text_content