Added pdfs (made pdfminer.six an optional dependency, installable via the new `pdf` extra)
This commit is contained in:
parent
8362df8e60
commit
11ffd2e550
5 changed files with 30 additions and 7 deletions
|
|
@ -28,7 +28,6 @@ dependencies = [
|
||||||
"requests",
|
"requests",
|
||||||
"markdownify~=0.14.1",
|
"markdownify~=0.14.1",
|
||||||
"numpy",
|
"numpy",
|
||||||
"pdfminer.six",
|
|
||||||
"puremagic",
|
"puremagic",
|
||||||
"pydub",
|
"pydub",
|
||||||
"olefile",
|
"olefile",
|
||||||
|
|
@ -47,12 +46,14 @@ all = [
|
||||||
"mammoth",
|
"mammoth",
|
||||||
"pandas",
|
"pandas",
|
||||||
"openpyxl",
|
"openpyxl",
|
||||||
"xlrd"
|
"xlrd",
|
||||||
|
"pdfminer.six"
|
||||||
]
|
]
|
||||||
pptx = ["python-pptx"]
|
pptx = ["python-pptx"]
|
||||||
docx = ["mammoth"]
|
docx = ["mammoth"]
|
||||||
xlsx = ["pandas", "openpyxl"]
|
xlsx = ["pandas", "openpyxl"]
|
||||||
xls = ["pandas", "xlrd"]
|
xls = ["pandas", "xlrd"]
|
||||||
|
pdf = ["pdfminer.six"]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||||
|
|
|
||||||
|
|
@ -36,7 +36,7 @@ class DocxConverter(HtmlConverter):
|
||||||
if extension.lower() != ".docx":
|
if extension.lower() != ".docx":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Load the dependencies
|
# Check the dependencies
|
||||||
if _dependency_exc_info is not None:
|
if _dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
MISSING_DEPENDENCY_MESSAGE.format(
|
MISSING_DEPENDENCY_MESSAGE.format(
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,17 @@
|
||||||
import pdfminer
|
import sys
|
||||||
import pdfminer.high_level
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
# Save reporting of any exceptions for later
|
||||||
|
_dependency_exc_info = None
|
||||||
|
try:
|
||||||
|
import pdfminer
|
||||||
|
import pdfminer.high_level
|
||||||
|
except ImportError:
|
||||||
|
# Preserve the error and stack trace for later
|
||||||
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
class PdfConverter(DocumentConverter):
|
class PdfConverter(DocumentConverter):
|
||||||
|
|
@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter):
|
||||||
if extension.lower() != ".pdf":
|
if extension.lower() != ".pdf":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Check the dependencies
|
||||||
|
if _dependency_exc_info is not None:
|
||||||
|
raise MissingDependencyException(
|
||||||
|
MISSING_DEPENDENCY_MESSAGE.format(
|
||||||
|
converter=type(self).__name__,
|
||||||
|
extension=".pdf",
|
||||||
|
feature="pdf",
|
||||||
|
)
|
||||||
|
) from _dependency_exc_info[1].with_traceback(
|
||||||
|
_dependency_exc_info[2]
|
||||||
|
) # Restore the original traceback
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=pdfminer.high_level.extract_text(local_path),
|
text_content=pdfminer.high_level.extract_text(local_path),
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,7 @@ class PptxConverter(HtmlConverter):
|
||||||
if extension.lower() != ".pptx":
|
if extension.lower() != ".pptx":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Load the dependencies
|
# Check the dependencies
|
||||||
if _dependency_exc_info is not None:
|
if _dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
MISSING_DEPENDENCY_MESSAGE.format(
|
MISSING_DEPENDENCY_MESSAGE.format(
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,7 @@ class XlsxConverter(HtmlConverter):
|
||||||
if extension.lower() != ".xlsx":
|
if extension.lower() != ".xlsx":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Load the dependencies
|
# Check the dependencies
|
||||||
if _xlsx_dependency_exc_info is not None:
|
if _xlsx_dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
MISSING_DEPENDENCY_MESSAGE.format(
|
MISSING_DEPENDENCY_MESSAGE.format(
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue