diff --git a/pyproject.toml b/pyproject.toml index c5bd58b..c070663 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "pandas", "openpyxl", "pdfminer.six", + "pymupdf4llm", "puremagic", "pydub", "youtube-transcript-api", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 040a586..82fd83a 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -23,6 +23,7 @@ import markdownify import pandas as pd import pdfminer import pdfminer.high_level +import pymupdf4llm import pptx # File-format detection @@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter): if extension.lower() != ".pdf": return None - return DocumentConverterResult( - title=None, - text_content=pdfminer.high_level.extract_text(local_path), - ) + # return DocumentConverterResult( + # title=None, + # text_content=pdfminer.high_level.extract_text(local_path), + # ) + text_content = pymupdf4llm.to_markdown(local_path, show_progress=False) + return DocumentConverterResult(title=None, text_content=text_content) class DocxConverter(HtmlConverter):