Added Ole files.

This commit is contained in:
Adam Fourney 2025-02-28 21:55:26 -08:00
parent 11ffd2e550
commit a2cf8ee889
2 changed files with 31 additions and 6 deletions

View file

@ -30,7 +30,6 @@ dependencies = [
"numpy",
"puremagic",
"pydub",
"olefile",
"youtube-transcript-api",
"SpeechRecognition",
"pathvalidate",
@ -47,13 +46,15 @@ all = [
"pandas",
"openpyxl",
"xlrd",
"pdfminer.six"
"pdfminer.six",
"olefile"
]
pptx = ["python-pptx"]
docx = ["mammoth"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
outlook = ["olefile"]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"

View file

@ -1,6 +1,16 @@
import olefile
import sys
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import olefile
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class OutlookMsgConverter(DocumentConverter):
@ -24,6 +34,18 @@ class OutlookMsgConverter(DocumentConverter):
if extension.lower() != ".msg":
return None
# Check: the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".msg",
feature="outlook",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
try:
msg = olefile.OleFileIO(local_path)
# Extract email metadata
@ -59,10 +81,12 @@ class OutlookMsgConverter(DocumentConverter):
f"Could not convert MSG file '{local_path}': {str(e)}"
)
def _get_stream_data(
self, msg: olefile.OleFileIO, stream_path: str
) -> Union[str, None]:
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
"""Helper to safely extract and decode stream data from the MSG file."""
assert isinstance(
msg, olefile.OleFileIO
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
try:
if msg.exists(stream_path):
data = msg.openstream(stream_path).read()