update: adding named parameter pdf_engine to .convert(); adding test cases for PDF. Raises an exception when pdf_engine is not valid.
This commit is contained in:
parent
ba5df9bbc3
commit
e80854859e
3 changed files with 21 additions and 5 deletions
|
|
@ -14,7 +14,7 @@ import tempfile
|
|||
import traceback
|
||||
import zipfile
|
||||
from xml.dom import minidom
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, Dict, List, Optional, Union, Literal
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from warnings import warn, resetwarnings, catch_warnings
|
||||
|
|
@ -680,7 +680,7 @@ class PdfConverter(DocumentConverter):
|
|||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||
"""
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
"""
|
||||
Example:
|
||||
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
|
||||
|
|
@ -690,13 +690,13 @@ class PdfConverter(DocumentConverter):
|
|||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".pdf":
|
||||
return None
|
||||
pdf_engine = kwargs.get("pdf_engine", "pdfminer")
|
||||
if pdf_engine == "pdfminer":
|
||||
text_content = pdfminer.high_level.extract_text(local_path)
|
||||
elif pdf_engine == "pymupdf4llm":
|
||||
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
|
||||
else:
|
||||
return None # unknown method
|
||||
# return None # unknown method
|
||||
raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].")
|
||||
return DocumentConverterResult(title=None, text_content=text_content)
|
||||
|
||||
|
||||
|
|
|
|||
BIN
tests/test_files/2308.08155v2.pdf
vendored
Normal file
BIN
tests/test_files/2308.08155v2.pdf
vendored
Normal file
Binary file not shown.
|
|
@ -7,7 +7,8 @@ import pytest
|
|||
import requests
|
||||
|
||||
from warnings import catch_warnings, resetwarnings
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, "/home/yxl/Projects/markitdown/src")
|
||||
from markitdown import MarkItDown
|
||||
|
||||
skip_remote = (
|
||||
|
|
@ -299,6 +300,20 @@ def test_markitdown_llm() -> None:
|
|||
for test_string in ["red", "circle", "blue", "square"]:
|
||||
assert test_string in result.text_content.lower()
|
||||
|
||||
def test_markitdown_pdf() -> None:
    """Convert the local test PDF with each PDF engine and check its text."""
    markitdown = MarkItDown()

    # The test reads the PDF stored under TEST_FILES_DIR instead of a URL.
    pdf_path = os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf")

    # Same order as before: pymupdf4llm first, then pdfminer.
    for engine in ("pymupdf4llm", "pdfminer"):
        result = markitdown.convert(pdf_path, pdf_engine=engine)
        for test_string in PDF_TEST_STRINGS:
            assert test_string in result.text_content
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""Runs this file's tests from the command line."""
|
||||
|
|
@ -307,3 +322,4 @@ if __name__ == "__main__":
|
|||
test_markitdown_exiftool()
|
||||
test_markitdown_deprecation()
|
||||
test_markitdown_llm()
|
||||
test_markitdown_pdf()
|
||||
|
|
|
|||
Loading…
Reference in a new issue