diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 254fccb..8a17099 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -28,7 +28,6 @@ dependencies = [ "requests", "markdownify~=0.14.1", "numpy", - "pdfminer.six", "puremagic", "pydub", "olefile", @@ -47,12 +46,14 @@ all = [ "mammoth", "pandas", "openpyxl", - "xlrd" + "xlrd", + "pdfminer.six" ] pptx = ["python-pptx"] docx = ["mammoth"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] +pdf = ["pdfminer.six"] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index de4bf0d..0866e59 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -36,7 +36,7 @@ class DocxConverter(HtmlConverter): if extension.lower() != ".docx": return None - # Load the dependencies + # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 3a2b671..3c5ecad 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,7 +1,17 @@ -import pdfminer -import pdfminer.high_level +import sys from typing import Union from ._base import DocumentConverter, DocumentConverterResult +from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + +# Try loading optional (but in this case, required) dependencies +# Save reporting of any exceptions for later +_dependency_exc_info = None +try: + import pdfminer + import pdfminer.high_level +except ImportError: + # Preserve the error 
and stack trace for later + _dependency_exc_info = sys.exc_info() class PdfConverter(DocumentConverter): @@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter): if extension.lower() != ".pdf": return None + # Check the dependencies + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".pdf", + feature="pdf", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # Restore the original traceback + return DocumentConverterResult( title=None, text_content=pdfminer.high_level.extract_text(local_path), diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index e8cbf71..431b6a0 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -64,7 +64,7 @@ class PptxConverter(HtmlConverter): if extension.lower() != ".pptx": return None - # Load the dependencies + # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 61c7e2c..56398ca 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -39,7 +39,7 @@ class XlsxConverter(HtmlConverter): if extension.lower() != ".xlsx": return None - # Load the dependencies + # Check the dependencies if _xlsx_dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format(