From b3f7e00112443b361f938aa73cdaa07c2dc20972 Mon Sep 17 00:00:00 2001
From: tungsten106 <yexl23000@gmail.com>
Date: Thu, 19 Dec 2024 16:36:05 +0800
Subject: [PATCH] update: change pdf text parser to pymupdf4llm

---
 pyproject.toml                |  1 +
 src/markitdown/_markitdown.py | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c5bd58b..c070663 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
   "pandas",
   "openpyxl",
   "pdfminer.six",
+  "pymupdf4llm",
   "puremagic",
   "pydub",
   "youtube-transcript-api",
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 040a586..82fd83a 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -23,6 +23,7 @@ import markdownify
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
+import pymupdf4llm
 import pptx
 
 # File-format detection
@@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter):
         if extension.lower() != ".pdf":
             return None
 
-        return DocumentConverterResult(
-            title=None,
-            text_content=pdfminer.high_level.extract_text(local_path),
-        )
+        # Extract with pymupdf4llm rather than pdfminer: to_markdown() yields
+        # the document as Markdown text, which is what this converter returns.
+        # show_progress=False asks the library not to report progress, so the
+        # conversion stays quiet when markitdown is used as a library.
+        text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
+        return DocumentConverterResult(title=None, text_content=text_content)
 
 
 class DocxConverter(HtmlConverter):