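Added pdfs (moved pdfminer.six from the core dependencies to an optional "pdf" extra, with a guarded import in the PDF converter)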

This commit is contained in:
Adam Fourney 2025-02-28 21:35:14 -08:00
parent 8362df8e60
commit 11ffd2e550
5 changed files with 30 additions and 7 deletions

View file

@ -28,7 +28,6 @@ dependencies = [
"requests", "requests",
"markdownify~=0.14.1", "markdownify~=0.14.1",
"numpy", "numpy",
"pdfminer.six",
"puremagic", "puremagic",
"pydub", "pydub",
"olefile", "olefile",
@ -47,12 +46,14 @@ all = [
"mammoth", "mammoth",
"pandas", "pandas",
"openpyxl", "openpyxl",
"xlrd" "xlrd",
"pdfminer.six"
] ]
pptx = ["python-pptx"] pptx = ["python-pptx"]
docx = ["mammoth"] docx = ["mammoth"]
xlsx = ["pandas", "openpyxl"] xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"] xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
[project.urls] [project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme" Documentation = "https://github.com/microsoft/markitdown#readme"

View file

@ -36,7 +36,7 @@ class DocxConverter(HtmlConverter):
if extension.lower() != ".docx": if extension.lower() != ".docx":
return None return None
# Load the dependencies # Check the dependencies
if _dependency_exc_info is not None: if _dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format( MISSING_DEPENDENCY_MESSAGE.format(

View file

@ -1,7 +1,17 @@
import pdfminer import sys
import pdfminer.high_level
from typing import Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class PdfConverter(DocumentConverter): class PdfConverter(DocumentConverter):
@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter):
if extension.lower() != ".pdf": if extension.lower() != ".pdf":
return None return None
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pdf",
feature="pdf",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=pdfminer.high_level.extract_text(local_path), text_content=pdfminer.high_level.extract_text(local_path),

View file

@ -64,7 +64,7 @@ class PptxConverter(HtmlConverter):
if extension.lower() != ".pptx": if extension.lower() != ".pptx":
return None return None
# Load the dependencies # Check the dependencies
if _dependency_exc_info is not None: if _dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format( MISSING_DEPENDENCY_MESSAGE.format(

View file

@ -39,7 +39,7 @@ class XlsxConverter(HtmlConverter):
if extension.lower() != ".xlsx": if extension.lower() != ".xlsx":
return None return None
# Load the dependencies # Check the dependencies
if _xlsx_dependency_exc_info is not None: if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format( MISSING_DEPENDENCY_MESSAGE.format(