diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 8a17099..1d823bf 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -30,7 +30,6 @@ dependencies = [ "numpy", "puremagic", "pydub", - "olefile", "youtube-transcript-api", "SpeechRecognition", "pathvalidate", @@ -47,13 +46,15 @@ all = [ "pandas", "openpyxl", "xlrd", - "pdfminer.six" + "pdfminer.six", + "olefile" ] pptx = ["python-pptx"] docx = ["mammoth"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] pdf = ["pdfminer.six"] +outlook = ["olefile"] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index 6764fc5..eb7a065 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -1,6 +1,16 @@ -import olefile +import sys from typing import Any, Union from ._base import DocumentConverter, DocumentConverterResult +from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + +# Try loading optional (but in this case, required) dependencies +# Save reporting of any exceptions for later +_dependency_exc_info = None +try: + import olefile +except ImportError: + # Preserve the error and stack trace for later + _dependency_exc_info = sys.exc_info() class OutlookMsgConverter(DocumentConverter): @@ -24,6 +34,18 @@ class OutlookMsgConverter(DocumentConverter): if extension.lower() != ".msg": return None + # Check: the dependencies + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".msg", + feature="outlook", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # Restore the original traceback + try: msg = olefile.OleFileIO(local_path) # Extract email metadata @@ -59,10 +81,12 @@ class OutlookMsgConverter(DocumentConverter): f"Could not convert MSG file '{local_path}': {str(e)}" ) - def _get_stream_data( - self, msg: olefile.OleFileIO, stream_path: str - ) -> Union[str, None]: + def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]: """Helper to safely extract and decode stream data from the MSG file.""" + assert isinstance( + msg, olefile.OleFileIO + ) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package) + try: if msg.exists(stream_path): data = msg.openstream(stream_path).read()