Added Outlook messages.

This commit is contained in:
Adam Fourney 2025-03-04 16:15:07 -08:00
parent 4d09a4c6c6
commit 7879028c98
6 changed files with 213 additions and 130 deletions

View file

@ -101,5 +101,4 @@ class StreamInfo:
# and, in any case, doesn't add new information.
guesses.append(cls(**kwargs))
# Return the guesses
return guesses

View file

@ -70,7 +70,8 @@ class BingSerpConverter(DocumentConverter):
query = parsed_params.get("q", [""])[0]
# Parse the stream
soup = BeautifulSoup(file_stream, "html.parser")
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Clean up some formatting
for tptt in soup.find_all(class_="tptt"):

View file

@ -50,7 +50,8 @@ class HtmlConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the stream
soup = BeautifulSoup(file_stream, "html.parser")
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks
for script in soup(["script", "style"]):

View file

@ -1,5 +1,6 @@
import sys
from typing import Any, Union
from typing import Any, Union, BinaryIO
from .._stream_info import StreamInfo
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@ -12,6 +13,12 @@ except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.ms-outlook",
]
ACCEPTED_FILE_EXTENSIONS = [".msg"]
class OutlookMsgConverter(DocumentConverter):
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
@ -26,14 +33,52 @@ class OutlookMsgConverter(DocumentConverter):
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a MSG file
extension = kwargs.get("file_extension", "")
if extension.lower() != ".msg":
return None
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """Decide whether this converter should handle the given stream.

    The stream is accepted when its extension is listed in
    ACCEPTED_FILE_EXTENSIONS or its mimetype starts with one of
    ACCEPTED_MIME_TYPE_PREFIXES. Otherwise the content is sniffed: the
    stream must be an OLE container whose directory listing contains the
    "__properties_version1.0" and "__recip_version1.0_#00000000" entries.

    The stream position is restored to where it was on entry before
    returning from the sniffing path.

    Args:
        file_stream: Binary stream to inspect; must support tell()/seek().
        stream_info: Metadata (mimetype, extension) describing the stream.
        **kwargs: Options to pass to the converter (unused here).

    Returns:
        True if the stream looks like an Outlook message, False otherwise.
    """
    mimetype = (stream_info.mimetype or "").lower()
    extension = (stream_info.extension or "").lower()

    # Check the extension and mimetype
    if extension in ACCEPTED_FILE_EXTENSIONS:
        return True

    for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
        if mimetype.startswith(prefix):
            return True

    # Content sniffing needs the optional olefile dependency. If its import
    # failed, decline the stream instead of letting a NameError escape.
    if _dependency_exc_info is not None:
        return False

    # Brute force, check if we have an OLE file
    cur_pos = file_stream.tell()
    try:
        if not olefile.isOleFile(file_stream):
            return False
    finally:
        file_stream.seek(cur_pos)

    # Brute force, check if it's an Outlook file
    try:
        msg = olefile.OleFileIO(file_stream)
        toc = "\n".join([str(stream) for stream in msg.listdir()])
        return (
            "__properties_version1.0" in toc
            and "__recip_version1.0_#00000000" in toc
        )
    except Exception:
        # Sniffing is best-effort: any parse failure just means
        # "not an Outlook message".
        return False
    finally:
        file_stream.seek(cur_pos)
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check: the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
@ -46,8 +91,7 @@ class OutlookMsgConverter(DocumentConverter):
_dependency_exc_info[2]
) # Restore the original traceback
try:
msg = olefile.OleFileIO(local_path)
msg = olefile.OleFileIO(file_stream)
# Extract email metadata
md_content = "# Email Message\n\n"
@ -77,11 +121,6 @@ class OutlookMsgConverter(DocumentConverter):
title=headers.get("Subject"),
)
except Exception as e:
raise FileConversionException(
f"Could not convert MSG file '{local_path}': {str(e)}"
)
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
"""Helper to safely extract and decode stream data from the MSG file."""
assert isinstance(

View file

@ -1,10 +1,27 @@
from xml.dom import minidom
from typing import Union
from typing import BinaryIO, Any, Union
from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify
from .._stream_info import StreamInfo
from .._base_converter import DocumentConverter, DocumentConverterResult
PRECISE_MIME_TYPE_PREFIXES = [
"application/rss",
"application/atom",
]
PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]
CANDIDATE_MIME_TYPE_PREFIXES = [
"text/xml",
"application/xml",
]
CANDIDATE_FILE_EXTENSIONS = [
".xml",
]
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
@ -14,42 +31,75 @@ class RssConverter(DocumentConverter):
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not RSS type
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".xml", ".rss", ".atom"]:
return None
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """Decide whether the stream should be handled as an RSS/Atom feed.

    Precise extensions and mimetype prefixes are accepted outright.
    Candidate (generic XML) extensions and mimetype prefixes are accepted
    only if self._check_xml() confirms the content. Anything else is
    rejected.
    """
    mimetype = (stream_info.mimetype or "").lower()
    extension = (stream_info.extension or "").lower()

    # Check for precise mimetypes and file extensions
    is_precise = extension in PRECISE_FILE_EXTENSIONS or any(
        mimetype.startswith(prefix) for prefix in PRECISE_MIME_TYPE_PREFIXES
    )
    if is_precise:
        return True

    # Check for candidate mimetypes and file extensions; these need a
    # look at the content before we commit
    is_candidate = extension in CANDIDATE_FILE_EXTENSIONS or any(
        mimetype.startswith(prefix) for prefix in CANDIDATE_MIME_TYPE_PREFIXES
    )
    if is_candidate:
        return self._check_xml(file_stream)

    return False
def _check_xml(self, file_stream: BinaryIO) -> bool:
cur_pos = file_stream.tell()
try:
doc = minidom.parse(local_path)
doc = minidom.parse(file_stream)
return self._feed_type(doc) is not None
except BaseException as _:
return None
result = None
pass
finally:
file_stream.seek(cur_pos)
return False
def _feed_type(self, doc: Any) -> str:
if doc.getElementsByTagName("rss"):
# A RSS feed must have a root element of <rss>
result = self._parse_rss_type(doc)
return "rss"
elif doc.getElementsByTagName("feed"):
root = doc.getElementsByTagName("feed")[0]
if root.getElementsByTagName("entry"):
# An Atom feed must have a root element of <feed> and at least one <entry>
result = self._parse_atom_type(doc)
else:
return None
else:
# not rss or atom
return "atom"
return None
return result
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
doc = minidom.parse(file_stream)
feed_type = self._feed_type(doc)
def _parse_atom_type(
self, doc: minidom.Document
) -> Union[None, DocumentConverterResult]:
if feed_type == "rss":
return self._parse_rss_type(doc)
elif feed_type == "atom":
return self._parse_atom_type(doc)
else:
raise ValueError("Unknown feed type")
def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
"""Parse the type of an Atom feed.
Returns None if the feed type is not recognized or something goes wrong.
"""
try:
root = doc.getElementsByTagName("feed")[0]
title = self._get_data_by_tag_name(root, "title")
subtitle = self._get_data_by_tag_name(root, "subtitle")
@ -76,17 +126,12 @@ class RssConverter(DocumentConverter):
markdown=md_text,
title=title,
)
except BaseException as _:
return None
def _parse_rss_type(
self, doc: minidom.Document
) -> Union[None, DocumentConverterResult]:
def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
"""Parse the type of an RSS feed.
Returns None if the feed type is not recognized or something goes wrong.
"""
try:
root = doc.getElementsByTagName("rss")[0]
channel = root.getElementsByTagName("channel")
if not channel:
@ -120,9 +165,6 @@ class RssConverter(DocumentConverter):
markdown=md_text,
title=channel_title,
)
except BaseException as _:
print(traceback.format_exc())
return None
def _parse_content(self, content: str) -> str:
"""Parse the content of an RSS feed item"""

View file

@ -61,7 +61,8 @@ class WikipediaConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the stream
soup = BeautifulSoup(file_stream, "html.parser")
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks
for script in soup(["script", "style"]):