update: renamed the "method" parameter of PdfConverter to "pdf_engine" so its purpose is clearer to users. Added a usage example to the PdfConverter.convert() docstring.
This commit is contained in:
parent
797e0d4071
commit
ba5df9bbc3
1 changed file with 9 additions and 4 deletions
|
|
@@ -677,18 +677,23 @@ class BingSerpConverter(DocumentConverter):
|
||||||
|
|
||||||
class PdfConverter(DocumentConverter):
|
class PdfConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
|
"""
|
||||||
|
Example:
|
||||||
|
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
|
||||||
|
>>> markitdown.convert(source, pdf_engine="pymupdf4llm")
|
||||||
|
"""
|
||||||
# Bail if not a PDF
|
# Bail if not a PDF
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() != ".pdf":
|
if extension.lower() != ".pdf":
|
||||||
return None
|
return None
|
||||||
method = kwargs.get("method", "pdfminer")
|
pdf_engine = kwargs.get("pdf_engine", "pdfminer")
|
||||||
if method == "pdfminer":
|
if pdf_engine == "pdfminer":
|
||||||
text_content = pdfminer.high_level.extract_text(local_path)
|
text_content = pdfminer.high_level.extract_text(local_path)
|
||||||
elif method == "pymupdf4llm":
|
elif pdf_engine == "pymupdf4llm":
|
||||||
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
|
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
|
||||||
else:
|
else:
|
||||||
return None # unknown method
|
return None # unknown method
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue