update: change pdf text parser to pymupdf4llm

This commit is contained in:
tungsten106 2024-12-19 16:36:05 +08:00
parent cb66b35f11
commit b3f7e00112
2 changed files with 8 additions and 4 deletions

View file

@@ -33,6 +33,7 @@ dependencies = [
"pandas",
"openpyxl",
"pdfminer.six",
"pymupdf4llm",
"puremagic",
"pydub",
"youtube-transcript-api",

View file

@@ -23,6 +23,7 @@ import markdownify
import pandas as pd
import pdfminer
import pdfminer.high_level
import pymupdf4llm
import pptx
# File-format detection
@@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter):
if extension.lower() != ".pdf":
return None
return DocumentConverterResult(
title=None,
text_content=pdfminer.high_level.extract_text(local_path),
)
# return DocumentConverterResult(
# title=None,
# text_content=pdfminer.high_level.extract_text(local_path),
# )
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
return DocumentConverterResult(title=None, text_content=text_content)
class DocxConverter(HtmlConverter):