diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 6614a5a..96de6ea 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -14,7 +14,7 @@ import tempfile
 import traceback
 import zipfile
 from xml.dom import minidom
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Literal
 from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
@@ -680,7 +680,12 @@ class PdfConverter(DocumentConverter):
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
     """
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
+        self,
+        local_path,
+        pdf_engine: Literal["pdfminer", "pymupdf4llm"] = "pdfminer",
+        **kwargs,
+    ) -> Union[None, DocumentConverterResult]:
         """
         Example:
         >>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
@@ -690,13 +695,17 @@ class PdfConverter(DocumentConverter):
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".pdf":
             return None
-        pdf_engine = kwargs.get("pdf_engine", "pdfminer")
         if pdf_engine == "pdfminer":
             text_content = pdfminer.high_level.extract_text(local_path)
         elif pdf_engine == "pymupdf4llm":
             text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
         else:
-            return None  # unknown method
+            # None is also what a non-PDF input returns above, so raise here
+            # instead of letting a mistyped engine name pass silently.
+            raise FileConversionException(
+                f"Unknown pdf_engine {pdf_engine!r}. "
+                "Please choose between ['pdfminer', 'pymupdf4llm']."
+            )
         return DocumentConverterResult(title=None, text_content=text_content)
 
 
diff --git a/tests/test_files/2308.08155v2.pdf b/tests/test_files/2308.08155v2.pdf
new file mode 100644
index 0000000..fffb9dd
Binary files /dev/null and b/tests/test_files/2308.08155v2.pdf differ
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 4a981bd..52d7391 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -299,6 +299,18 @@ def test_markitdown_llm() -> None:
     for test_string in ["red", "circle", "blue", "square"]:
         assert test_string in result.text_content.lower()
 
+
+def test_markitdown_pdf() -> None:
+    """Both PDF engines must extract the expected strings from the local PDF."""
+    markitdown = MarkItDown()
+    pdf_path = os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf")
+
+    # Uses the PDF checked in under TEST_FILES_DIR rather than PDF_TEST_URL.
+    for pdf_engine in ("pymupdf4llm", "pdfminer"):
+        result = markitdown.convert(pdf_path, pdf_engine=pdf_engine)
+        for test_string in PDF_TEST_STRINGS:
+            assert test_string in result.text_content
+
 
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
@@ -307,3 +319,4 @@ if __name__ == "__main__":
     test_markitdown_exiftool()
     test_markitdown_deprecation()
     test_markitdown_llm()
+    test_markitdown_pdf()