update: Add engine_kwargs for customizing engine parameters. Update how PdfConverter calls its engines to make it easier to add more engines. Add examples of using engine_kwargs to extract PDF images.
This commit is contained in:
parent
e80854859e
commit
565ef052c1
2 changed files with 43 additions and 15 deletions
|
|
@ -14,7 +14,7 @@ import tempfile
|
||||||
import traceback
|
import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from typing import Any, Dict, List, Optional, Union, Literal
|
from typing import Any, Dict, List, Optional, Union, Literal, Mapping
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
from warnings import warn, resetwarnings, catch_warnings
|
from warnings import warn, resetwarnings, catch_warnings
|
||||||
|
|
@ -679,24 +679,35 @@ class PdfConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||||
"""
|
"""
|
||||||
|
_engines: Mapping[str, Any] = {
|
||||||
|
"pdfminer": pdfminer.high_level.extract_text,
|
||||||
|
"pymupdf4llm": pymupdf4llm.to_markdown,
|
||||||
|
}
|
||||||
|
|
||||||
def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
|
self,
|
||||||
|
local_path,
|
||||||
|
engine: Literal["pdfminer", "pymupdf4llm"] = "pdfminer",
|
||||||
|
engine_kwargs={},
|
||||||
|
**kwargs,
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
"""
|
"""
|
||||||
Example:
|
Example:
|
||||||
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
|
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
|
||||||
>>> markitdown.convert(source, pdf_engine="pymupdf4llm")
|
>>> markitdown.convert(source, engine="pymupdf4llm")
|
||||||
"""
|
"""
|
||||||
# Bail if not a PDF
|
# Bail if not a PDF
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".pdf":
|
if extension.lower() != ".pdf":
|
||||||
return None
|
return None
|
||||||
if pdf_engine == "pdfminer":
|
if engine is not None and engine not in self._engines:
|
||||||
text_content = pdfminer.high_level.extract_text(local_path)
|
raise FileConversionException(
|
||||||
elif pdf_engine == "pymupdf4llm":
|
"'pdf_engine' not valid. Please choose between {}.".format(
|
||||||
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
|
list(self._engines.keys())
|
||||||
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# return None # unknown method
|
text_content = self._engines[engine](local_path, **engine_kwargs)
|
||||||
raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].")
|
|
||||||
return DocumentConverterResult(title=None, text_content=text_content)
|
return DocumentConverterResult(title=None, text_content=text_content)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,6 @@ import pytest
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from warnings import catch_warnings, resetwarnings
|
from warnings import catch_warnings, resetwarnings
|
||||||
import sys
|
|
||||||
sys.path.insert(0, "/home/yxl/Projects/markitdown/src")
|
|
||||||
from markitdown import MarkItDown
|
from markitdown import MarkItDown
|
||||||
|
|
||||||
skip_remote = (
|
skip_remote = (
|
||||||
|
|
@ -306,12 +304,31 @@ def test_markitdown_pdf() -> None:
|
||||||
# I test by local pdf, using PDF_TEST_URL may also be fine.
|
# I test by local pdf, using PDF_TEST_URL may also be fine.
|
||||||
|
|
||||||
# By pymupdf4llm
|
# By pymupdf4llm
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pymupdf4llm")
|
result = markitdown.convert(
|
||||||
|
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
|
||||||
|
engine="pymupdf4llm",
|
||||||
|
engine_kwargs={"show_progress": False}, # additional kwargs
|
||||||
|
)
|
||||||
|
for test_string in PDF_TEST_STRINGS:
|
||||||
|
assert test_string in result.text_content
|
||||||
|
|
||||||
|
# By pymupdf4llm and extract images
|
||||||
|
result = markitdown.convert(
|
||||||
|
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
|
||||||
|
engine="pymupdf4llm",
|
||||||
|
engine_kwargs={
|
||||||
|
"show_progress": False,
|
||||||
|
"write_images": True,
|
||||||
|
"image_path": "tests/pics",
|
||||||
|
}, # `write_images` must be True, setting `image_path` for images saving dir.
|
||||||
|
)
|
||||||
for test_string in PDF_TEST_STRINGS:
|
for test_string in PDF_TEST_STRINGS:
|
||||||
assert test_string in result.text_content
|
assert test_string in result.text_content
|
||||||
|
|
||||||
# By pdfminer
|
# By pdfminer
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pdfminer")
|
result = markitdown.convert(
|
||||||
|
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), engine="pdfminer"
|
||||||
|
)
|
||||||
for test_string in PDF_TEST_STRINGS:
|
for test_string in PDF_TEST_STRINGS:
|
||||||
assert test_string in result.text_content
|
assert test_string in result.text_content
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue