update: change pdf text parser to pymupdf4llm

2024-12-19 16:36:05 +08:00 · 2024-12-19 16:36:05 +08:00 · b3f7e00112
commit b3f7e00112
parent cb66b35f11
2 changed files with 8 additions and 4 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
  "pandas",
  "openpyxl",
  "pdfminer.six",
+  "pymupdf4llm",
  "puremagic",
  "pydub",
  "youtube-transcript-api",
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -23,6 +23,7 @@ import markdownify
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
+import pymupdf4llm
 import pptx

 # File-format detection
@@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter):
        if extension.lower() != ".pdf":
            return None

-        return DocumentConverterResult(
-            title=None,
-            text_content=pdfminer.high_level.extract_text(local_path),
-        )
+        # return DocumentConverterResult(
+        #     title=None,
+        #     text_content=pdfminer.high_level.extract_text(local_path),
+        # )
+        text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
+        return DocumentConverterResult(title=None, text_content=text_content)


 class DocxConverter(HtmlConverter):