update: change pdf text parser to pymupdf4llm
This commit is contained in:
parent
cb66b35f11
commit
b3f7e00112
2 changed files with 8 additions and 4 deletions
|
|
@@ -33,6 +33,7 @@ dependencies = [
|
||||||
"pandas",
|
"pandas",
|
||||||
"openpyxl",
|
"openpyxl",
|
||||||
"pdfminer.six",
|
"pdfminer.six",
|
||||||
|
"pymupdf4llm",
|
||||||
"puremagic",
|
"puremagic",
|
||||||
"pydub",
|
"pydub",
|
||||||
"youtube-transcript-api",
|
"youtube-transcript-api",
|
||||||
|
|
|
||||||
|
|
@@ -23,6 +23,7 @@ import markdownify
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pdfminer
|
import pdfminer
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
|
import pymupdf4llm
|
||||||
import pptx
|
import pptx
|
||||||
|
|
||||||
# File-format detection
|
# File-format detection
|
||||||
|
|
@@ -684,10 +685,12 @@ class PdfConverter(DocumentConverter):
|
||||||
if extension.lower() != ".pdf":
|
if extension.lower() != ".pdf":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return DocumentConverterResult(
|
# return DocumentConverterResult(
|
||||||
title=None,
|
# title=None,
|
||||||
text_content=pdfminer.high_level.extract_text(local_path),
|
# text_content=pdfminer.high_level.extract_text(local_path),
|
||||||
)
|
# )
|
||||||
|
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
|
||||||
|
return DocumentConverterResult(title=None, text_content=text_content)
|
||||||
|
|
||||||
|
|
||||||
class DocxConverter(HtmlConverter):
|
class DocxConverter(HtmlConverter):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue