From ba5df9bbc3db8703903929cd55e56a334c789e16 Mon Sep 17 00:00:00 2001 From: tungsten106 Date: Tue, 24 Dec 2024 15:11:37 +0800 Subject: [PATCH] update: renamed the "method" parameter of PdfConverter to "pdf_engine" for clearer usage. Add an example of calling PdfConverter.convert(). --- src/markitdown/_markitdown.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 147acb8..6614a5a 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -677,18 +677,23 @@ class BingSerpConverter(DocumentConverter): class PdfConverter(DocumentConverter): """ - Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. + Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. """ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + """ + Example: + >>> source = "https://arxiv.org/pdf/2308.08155v2.pdf" + >>> markitdown.convert(source, pdf_engine="pymupdf4llm") + """ # Bail if not a PDF extension = kwargs.get("file_extension", "") if extension.lower() != ".pdf": return None - method = kwargs.get("method", "pdfminer") - if method == "pdfminer": + pdf_engine = kwargs.get("pdf_engine", "pdfminer") + if pdf_engine == "pdfminer": text_content = pdfminer.high_level.extract_text(local_path) - elif method == "pymupdf4llm": + elif pdf_engine == "pymupdf4llm": text_content = pymupdf4llm.to_markdown(local_path, show_progress=False) else: return None # unknown method