Exploring ways to enable optional dependencies. Starting with pptx.

This commit is contained in:
Adam Fourney 2025-02-28 11:57:51 -08:00
parent 0f63a7e28f
commit 7d2e0bd9d4
3 changed files with 27 additions and 5 deletions

View file

@ -6,7 +6,7 @@ from .__about__ import __version__
from ._markitdown import MarkItDown
from ._exceptions import (
MarkItDownException,
MissingOptionalDependencyException,
MissingDependencyException,
FileConversionException,
UnsupportedFormatException,
)
@ -18,7 +18,7 @@ __all__ = [
"DocumentConverter",
"DocumentConverterResult",
"MarkItDownException",
"MissingOptionalDependencyException",
"MissingDependencyException",
"FileConversionException",
"UnsupportedFormatException",
]

View file

@ -6,7 +6,7 @@ class MarkItDownException(BaseException):
pass
class MissingOptionalDependencyException(MarkItDownException):
class MissingDependencyException(MarkItDownException):
"""
Converters shipped with MarkItDown may depend on optional
dependencies. This exception is thrown when a converter's

View file

@ -1,12 +1,22 @@
import base64
import pptx
import re
import html
import sys
from typing import Union
from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException
# Try to import the optional dependency that this converter cannot work without.
# If the import fails, record the error now and report it only when a conversion is attempted.
_dependency_exc_info = None
try:
import pptx
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class PptxConverter(HtmlConverter):
@ -54,9 +64,21 @@ class PptxConverter(HtmlConverter):
if extension.lower() != ".pptx":
return None
md_content = ""
# Fail with an actionable error if the optional dependency could not be imported
if _dependency_exc_info is not None:
raise MissingDependencyException(
f"""{type(self).__name__} recognized the input as a .pptx file, but the dependencies needed to read .pptx files have not been installed. To resolve this error, include the optional dependency [pptx] or [all] when installing MarkItDown. For example:
* pip install markitdown[pptx]
* pip install markitdown[all]
* pip install markitdown[pptx, docx, ...]
* etc."""
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
presentation = pptx.Presentation(local_path)
md_content = ""
slide_num = 0
for slide in presentation.slides:
slide_num += 1