diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 96de6ea..18ad854 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -14,7 +14,7 @@ import tempfile
 import traceback
 import zipfile
 from xml.dom import minidom
-from typing import Any, Dict, List, Optional, Union, Literal
+from typing import Any, Dict, List, Optional, Union, Literal, Mapping
 from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
@@ -679,24 +679,43 @@ class PdfConverter(DocumentConverter):
     """
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
     """
+    # Supported engines: name -> callable that takes the local PDF path.
+    _engines: Mapping[str, Any] = {
+        "pdfminer": pdfminer.high_level.extract_text,
+        "pymupdf4llm": pymupdf4llm.to_markdown,
+    }
 
-    def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
+        self,
+        local_path,
+        engine: Literal["pdfminer", "pymupdf4llm"] = "pdfminer",
+        engine_kwargs: Optional[Mapping[str, Any]] = None,
+        **kwargs,
+    ) -> Union[None, DocumentConverterResult]:
         """
+        Convert the PDF at `local_path` to text with the selected engine.
+
+        `engine` must be a key of `_engines`; otherwise a
+        FileConversionException is raised. `engine_kwargs` are forwarded
+        as keyword arguments to the engine's callable.
+
         Example:
         >>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
-        >>> markitdown.convert(source, pdf_engine="pymupdf4llm")
+        >>> markitdown.convert(source, engine="pymupdf4llm")
         """
         # Bail if not a PDF
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".pdf":
             return None
 
-        if pdf_engine == "pdfminer":
-            text_content = pdfminer.high_level.extract_text(local_path)
-        elif pdf_engine == "pymupdf4llm":
-            text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
+        if engine not in self._engines:
+            raise FileConversionException(
+                "'engine' not valid. Please choose between {}.".format(
+                    list(self._engines)
+                )
+            )
         else:
-            # return None # unknown method
-            raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].")
+            # `engine_kwargs` defaults to None; fall back to no extra arguments.
+            text_content = self._engines[engine](local_path, **(engine_kwargs or {}))
 
         return DocumentConverterResult(title=None, text_content=text_content)
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 52d7391..f05d049 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -7,8 +7,7 @@ import pytest
 import requests
 
 from warnings import catch_warnings, resetwarnings
-import sys
-sys.path.insert(0, "/home/yxl/Projects/markitdown/src")
+import tempfile
 from markitdown import MarkItDown
 
 skip_remote = (
@@ -302,16 +301,37 @@ def test_markitdown_llm() -> None:
 
 def test_markitdown_pdf() -> None:
     markitdown = MarkItDown()
-    
+
     # I test by local pdf, using PDF_TEST_URL may also be fine.
-    
+
     # By pymupdf4llm
-    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pymupdf4llm")
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
+        engine="pymupdf4llm",
+        engine_kwargs={"show_progress": False},  # additional kwargs
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # By pymupdf4llm and extract images
+    # Images go to a throwaway directory so the test leaves no files behind.
+    with tempfile.TemporaryDirectory() as image_dir:
+        result = markitdown.convert(
+            os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
+            engine="pymupdf4llm",
+            engine_kwargs={
+                "show_progress": False,
+                "write_images": True,  # `write_images` must be True for images to be saved
+                "image_path": image_dir,  # directory the images are saved to
+            },
+        )
     for test_string in PDF_TEST_STRINGS:
         assert test_string in result.text_content
 
     # By pdfminer
-    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pdfminer")
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), engine="pdfminer"
+    )
     for test_string in PDF_TEST_STRINGS:
         assert test_string in result.text_content
 