update: change pdf text parser to pymupdf4llm

This commit is contained in:
tungsten106 2024-12-19 16:36:05 +08:00
parent cb66b35f11
commit b3f7e00112
2 changed files with 8 additions and 4 deletions

View file

@@ -33,6 +33,7 @@ dependencies = [
"pandas", "pandas",
"openpyxl", "openpyxl",
"pdfminer.six", "pdfminer.six",
"pymupdf4llm",
"puremagic", "puremagic",
"pydub", "pydub",
"youtube-transcript-api", "youtube-transcript-api",

View file

@@ -23,6 +23,7 @@ import markdownify
import pandas as pd import pandas as pd
import pdfminer import pdfminer
import pdfminer.high_level import pdfminer.high_level
import pymupdf4llm
import pptx import pptx
# File-format detection # File-format detection
@@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter):
if extension.lower() != ".pdf": if extension.lower() != ".pdf":
return None return None
return DocumentConverterResult( # return DocumentConverterResult(
title=None, # title=None,
text_content=pdfminer.high_level.extract_text(local_path), # text_content=pdfminer.high_level.extract_text(local_path),
) # )
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
return DocumentConverterResult(title=None, text_content=text_content)
class DocxConverter(HtmlConverter): class DocxConverter(HtmlConverter):