update: add a named `pdf_engine` parameter to `.convert()`; add test cases for PDF conversion. Raise an exception when `pdf_engine` is not valid.

This commit is contained in:
tungsten106 2024-12-25 15:06:16 +08:00
parent ba5df9bbc3
commit e80854859e
3 changed files with 21 additions and 5 deletions

View file

@ -14,7 +14,7 @@ import tempfile
import traceback
import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, Literal
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
@ -680,7 +680,7 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]:
"""
Example:
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
@ -690,13 +690,13 @@ class PdfConverter(DocumentConverter):
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf":
return None
pdf_engine = kwargs.get("pdf_engine", "pdfminer")
if pdf_engine == "pdfminer":
text_content = pdfminer.high_level.extract_text(local_path)
elif pdf_engine == "pymupdf4llm":
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
else:
return None # unknown method
# return None # unknown method
raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].")
return DocumentConverterResult(title=None, text_content=text_content)

BIN
tests/test_files/2308.08155v2.pdf vendored Normal file

Binary file not shown.

View file

@ -7,7 +7,8 @@ import pytest
import requests
from warnings import catch_warnings, resetwarnings
import os
import sys

# Make the in-repo package importable when the tests are run from a source
# checkout. The location is derived from this file rather than hard-coded to
# one developer's home directory, so it works on any machine and in CI.
# NOTE(review): assumes the layout <repo>/tests/<this file> and <repo>/src — confirm.
sys.path.insert(
    0,
    os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, "src")
    ),
)
from markitdown import MarkItDown
skip_remote = (
@ -299,6 +300,20 @@ def test_markitdown_llm() -> None:
for test_string in ["red", "circle", "blue", "square"]:
assert test_string in result.text_content.lower()
def test_markitdown_pdf() -> None:
    """Convert the bundled sample PDF with each PDF engine and check its text.

    The local copy under TEST_FILES_DIR is used; every entry of
    PDF_TEST_STRINGS must appear in the converted text for both engines.
    """
    converter = MarkItDown()
    pdf_path = os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf")

    # Same order as before: pymupdf4llm first, then pdfminer.
    for engine in ("pymupdf4llm", "pdfminer"):
        converted = converter.convert(pdf_path, pdf_engine=engine)
        for expected in PDF_TEST_STRINGS:
            assert expected in converted.text_content
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
@ -307,3 +322,4 @@ if __name__ == "__main__":
test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()
test_markitdown_pdf()