Added pdfs: made pdfminer.six an optional dependency (the "pdf" extra) and guarded its import in PdfConverter

This commit is contained in:
Adam Fourney 2025-02-28 21:35:14 -08:00
parent 8362df8e60
commit 11ffd2e550
5 changed files with 30 additions and 7 deletions

View file

@ -28,7 +28,6 @@ dependencies = [
"requests",
"markdownify~=0.14.1",
"numpy",
"pdfminer.six",
"puremagic",
"pydub",
"olefile",
@ -47,12 +46,14 @@ all = [
"mammoth",
"pandas",
"openpyxl",
"xlrd"
"xlrd",
"pdfminer.six"
]
pptx = ["python-pptx"]
docx = ["mammoth"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"

View file

@ -36,7 +36,7 @@ class DocxConverter(HtmlConverter):
if extension.lower() != ".docx":
return None
# Load the dependencies
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(

View file

@ -1,7 +1,17 @@
import pdfminer
import pdfminer.high_level
import sys
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class PdfConverter(DocumentConverter):
@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter):
if extension.lower() != ".pdf":
return None
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pdf",
feature="pdf",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
return DocumentConverterResult(
title=None,
text_content=pdfminer.high_level.extract_text(local_path),

View file

@ -64,7 +64,7 @@ class PptxConverter(HtmlConverter):
if extension.lower() != ".pptx":
return None
# Load the dependencies
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(

View file

@ -39,7 +39,7 @@ class XlsxConverter(HtmlConverter):
if extension.lower() != ".xlsx":
return None
# Load the dependencies
# Check the dependencies
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(