From b3f7e00112443b361f938aa73cdaa07c2dc20972 Mon Sep 17 00:00:00 2001 From: tungsten106 Date: Thu, 19 Dec 2024 16:36:05 +0800 Subject: [PATCH] update: change pdf text parser to pymupdf4llm --- pyproject.toml | 1 + src/markitdown/_markitdown.py | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c5bd58b..c070663 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "pandas", "openpyxl", "pdfminer.six", + "pymupdf4llm", "puremagic", "pydub", "youtube-transcript-api", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 040a586..82fd83a 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -23,6 +23,7 @@ import markdownify import pandas as pd import pdfminer import pdfminer.high_level +import pymupdf4llm import pptx # File-format detection @@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter): if extension.lower() != ".pdf": return None - return DocumentConverterResult( - title=None, - text_content=pdfminer.high_level.extract_text(local_path), - ) + # return DocumentConverterResult( + # title=None, + # text_content=pdfminer.high_level.extract_text(local_path), + # ) + text_content = pymupdf4llm.to_markdown(local_path, show_progress=False) + return DocumentConverterResult(title=None, text_content=text_content) class DocxConverter(HtmlConverter):