Added pdfs
This commit is contained in:
parent
8362df8e60
commit
11ffd2e550
5 changed files with 30 additions and 7 deletions
|
|
@ -28,7 +28,6 @@ dependencies = [
|
|||
"requests",
|
||||
"markdownify~=0.14.1",
|
||||
"numpy",
|
||||
"pdfminer.six",
|
||||
"puremagic",
|
||||
"pydub",
|
||||
"olefile",
|
||||
|
|
@ -47,12 +46,14 @@ all = [
|
|||
"mammoth",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd"
|
||||
"xlrd",
|
||||
"pdfminer.six"
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth"]
|
||||
xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ class DocxConverter(HtmlConverter):
|
|||
if extension.lower() != ".docx":
|
||||
return None
|
||||
|
||||
# Load the dependencies
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
|
|
|
|||
|
|
@ -1,7 +1,17 @@
|
|||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import sys
|
||||
from typing import Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class PdfConverter(DocumentConverter):
|
||||
|
|
@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter):
|
|||
if extension.lower() != ".pdf":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".pdf",
|
||||
feature="pdf",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
text_content=pdfminer.high_level.extract_text(local_path),
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ class PptxConverter(HtmlConverter):
|
|||
if extension.lower() != ".pptx":
|
||||
return None
|
||||
|
||||
# Load the dependencies
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ class XlsxConverter(HtmlConverter):
|
|||
if extension.lower() != ".xlsx":
|
||||
return None
|
||||
|
||||
# Load the dependencies
|
||||
# Check the dependencies
|
||||
if _xlsx_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
|
|
|
|||
Loading…
Reference in a new issue