update: change pdf text parser to pymupdf4llm

2024-12-19 16:36:05 +08:00 · 2024-12-19 16:36:05 +08:00 · b3f7e00112
commit b3f7e00112
parent cb66b35f11
2 changed files with 8 additions and 4 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
  "pandas",
  "openpyxl",
  "pdfminer.six",
  "pymupdf4llm",
  "puremagic",
  "pydub",
  "youtube-transcript-api",
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -23,6 +23,7 @@ import markdownify
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
 import pymupdf4llm
 import pptx
 # File-format detection
@@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter):
        if extension.lower() != ".pdf":
            return None
-        return DocumentConverterResult(
+        # return DocumentConverterResult(
-            title=None,
+        #     title=None,
-            text_content=pdfminer.high_level.extract_text(local_path),
+        #     text_content=pdfminer.high_level.extract_text(local_path),
-        )
+        # )
        text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
        return DocumentConverterResult(title=None, text_content=text_content)
 class DocxConverter(HtmlConverter):