Added Ole files.
This commit is contained in:
parent
11ffd2e550
commit
a2cf8ee889
2 changed files with 31 additions and 6 deletions
|
|
@ -30,7 +30,6 @@ dependencies = [
|
||||||
"numpy",
|
"numpy",
|
||||||
"puremagic",
|
"puremagic",
|
||||||
"pydub",
|
"pydub",
|
||||||
"olefile",
|
|
||||||
"youtube-transcript-api",
|
"youtube-transcript-api",
|
||||||
"SpeechRecognition",
|
"SpeechRecognition",
|
||||||
"pathvalidate",
|
"pathvalidate",
|
||||||
|
|
@ -47,13 +46,15 @@ all = [
|
||||||
"pandas",
|
"pandas",
|
||||||
"openpyxl",
|
"openpyxl",
|
||||||
"xlrd",
|
"xlrd",
|
||||||
"pdfminer.six"
|
"pdfminer.six",
|
||||||
|
"olefile"
|
||||||
]
|
]
|
||||||
pptx = ["python-pptx"]
|
pptx = ["python-pptx"]
|
||||||
docx = ["mammoth"]
|
docx = ["mammoth"]
|
||||||
xlsx = ["pandas", "openpyxl"]
|
xlsx = ["pandas", "openpyxl"]
|
||||||
xls = ["pandas", "xlrd"]
|
xls = ["pandas", "xlrd"]
|
||||||
pdf = ["pdfminer.six"]
|
pdf = ["pdfminer.six"]
|
||||||
|
outlook = ["olefile"]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,16 @@
|
||||||
import olefile
|
import sys
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
# Save reporting of any exceptions for later
|
||||||
|
_dependency_exc_info = None
|
||||||
|
try:
|
||||||
|
import olefile
|
||||||
|
except ImportError:
|
||||||
|
# Preserve the error and stack trace for later
|
||||||
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
class OutlookMsgConverter(DocumentConverter):
|
class OutlookMsgConverter(DocumentConverter):
|
||||||
|
|
@ -24,6 +34,18 @@ class OutlookMsgConverter(DocumentConverter):
|
||||||
if extension.lower() != ".msg":
|
if extension.lower() != ".msg":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Check that the optional olefile dependency was imported successfully
|
||||||
|
if _dependency_exc_info is not None:
|
||||||
|
raise MissingDependencyException(
|
||||||
|
MISSING_DEPENDENCY_MESSAGE.format(
|
||||||
|
converter=type(self).__name__,
|
||||||
|
extension=".msg",
|
||||||
|
feature="outlook",
|
||||||
|
)
|
||||||
|
) from _dependency_exc_info[1].with_traceback(
|
||||||
|
_dependency_exc_info[2]
|
||||||
|
) # Restore the original traceback
|
||||||
|
|
||||||
try:
|
try:
|
||||||
msg = olefile.OleFileIO(local_path)
|
msg = olefile.OleFileIO(local_path)
|
||||||
# Extract email metadata
|
# Extract email metadata
|
||||||
|
|
@ -59,10 +81,12 @@ class OutlookMsgConverter(DocumentConverter):
|
||||||
f"Could not convert MSG file '{local_path}': {str(e)}"
|
f"Could not convert MSG file '{local_path}': {str(e)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_stream_data(
|
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
|
||||||
self, msg: olefile.OleFileIO, stream_path: str
|
|
||||||
) -> Union[str, None]:
|
|
||||||
"""Helper to safely extract and decode stream data from the MSG file."""
|
"""Helper to safely extract and decode stream data from the MSG file."""
|
||||||
|
assert isinstance(
|
||||||
|
msg, olefile.OleFileIO
|
||||||
|
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if msg.exists(stream_path):
|
if msg.exists(stream_path):
|
||||||
data = msg.openstream(stream_path).read()
|
data = msg.openstream(stream_path).read()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue