update: renamed the "method" parameter of PdfConverter to "pdf_engine" for clearer user instruction. Added an example of calling PdfConverter.convert().

2024-12-24 15:11:37 +08:00 · 2024-12-24 15:11:37 +08:00 · ba5df9bbc3
commit ba5df9bbc3
parent 797e0d4071
1 changed files with 9 additions and 4 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -677,18 +677,23 @@ class BingSerpConverter(DocumentConverter):

 class PdfConverter(DocumentConverter):
    """
-    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.    
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """
+        Example:
+        >>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
+        >>> markitdown.convert(source, pdf_engine="pymupdf4llm")
+        """
        # Bail if not a PDF
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pdf":
            return None
-        method = kwargs.get("method", "pdfminer")
-        if method == "pdfminer":
+        pdf_engine = kwargs.get("pdf_engine", "pdfminer")
+        if pdf_engine == "pdfminer":
            text_content = pdfminer.high_level.extract_text(local_path)
-        elif method == "pymupdf4llm":
+        elif pdf_engine == "pymupdf4llm":
            text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
        else:
            return None     # unknown method