update: change pdf text parser to pymupdf4llm
This commit is contained in:
parent
cb66b35f11
commit
b3f7e00112
2 changed files with 8 additions and 4 deletions
|
|
@@ -33,6 +33,7 @@ dependencies = [
|
|||
"pandas",
|
||||
"openpyxl",
|
||||
"pdfminer.six",
|
||||
"pymupdf4llm",
|
||||
"puremagic",
|
||||
"pydub",
|
||||
"youtube-transcript-api",
|
||||
|
|
|
|||
|
|
@@ -23,6 +23,7 @@ import markdownify
|
|||
import pandas as pd
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import pymupdf4llm
|
||||
import pptx
|
||||
|
||||
# File-format detection
|
||||
|
|
@@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter):
|
|||
if extension.lower() != ".pdf":
|
||||
return None
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
text_content=pdfminer.high_level.extract_text(local_path),
|
||||
)
|
||||
# return DocumentConverterResult(
|
||||
# title=None,
|
||||
# text_content=pdfminer.high_level.extract_text(local_path),
|
||||
# )
|
||||
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
|
||||
return DocumentConverterResult(title=None, text_content=text_content)
|
||||
|
||||
|
||||
class DocxConverter(HtmlConverter):
|
||||
|
|
|
|||
Loading…
Reference in a new issue