Added Outlook messages.

This commit is contained in:
Adam Fourney 2025-03-04 16:15:07 -08:00
parent 4d09a4c6c6
commit 7879028c98
6 changed files with 213 additions and 130 deletions

View file

@@ -101,5 +101,4 @@ class StreamInfo:
# and, in any case, doesn't add new information. # and, in any case, doesn't add new information.
guesses.append(cls(**kwargs)) guesses.append(cls(**kwargs))
# Return the guesses
return guesses return guesses

View file

@@ -70,7 +70,8 @@ class BingSerpConverter(DocumentConverter):
query = parsed_params.get("q", [""])[0] query = parsed_params.get("q", [""])[0]
# Parse the stream # Parse the stream
soup = BeautifulSoup(file_stream, "html.parser") encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Clean up some formatting # Clean up some formatting
for tptt in soup.find_all(class_="tptt"): for tptt in soup.find_all(class_="tptt"):

View file

@@ -50,7 +50,8 @@ class HtmlConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Parse the stream # Parse the stream
soup = BeautifulSoup(file_stream, "html.parser") encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks # Remove javascript and style blocks
for script in soup(["script", "style"]): for script in soup(["script", "style"]):

View file

@@ -1,5 +1,6 @@
import sys import sys
from typing import Any, Union from typing import Any, Union, BinaryIO
from .._stream_info import StreamInfo
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -12,6 +13,12 @@ except ImportError:
# Preserve the error and stack trace for later # Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info() _dependency_exc_info = sys.exc_info()
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.ms-outlook",
]
ACCEPTED_FILE_EXTENSIONS = [".msg"]
class OutlookMsgConverter(DocumentConverter): class OutlookMsgConverter(DocumentConverter):
"""Converts Outlook .msg files to markdown by extracting email metadata and content. """Converts Outlook .msg files to markdown by extracting email metadata and content.
@@ -26,14 +33,52 @@ class OutlookMsgConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def accepts(
self, local_path: str, **kwargs: Any self,
) -> Union[None, DocumentConverterResult]: file_stream: BinaryIO,
# Bail if not a MSG file stream_info: StreamInfo,
extension = kwargs.get("file_extension", "") **kwargs: Any, # Options to pass to the converter
if extension.lower() != ".msg": ) -> bool:
return None mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
# Check the extension and mimetype
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
# Brute force, check if we have an OLE file
cur_pos = file_stream.tell()
try:
if not olefile.isOleFile(file_stream):
return False
finally:
file_stream.seek(cur_pos)
# Brute force, check if it's an Outlook file
try:
msg = olefile.OleFileIO(file_stream)
toc = "\n".join([str(stream) for stream in msg.listdir()])
return (
"__properties_version1.0" in toc
and "__recip_version1.0_#00000000" in toc
)
except Exception as e:
pass
finally:
file_stream.seek(cur_pos)
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check: the dependencies # Check: the dependencies
if _dependency_exc_info is not None: if _dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
@@ -46,41 +91,35 @@ class OutlookMsgConverter(DocumentConverter):
_dependency_exc_info[2] _dependency_exc_info[2]
) # Restore the original traceback ) # Restore the original traceback
try: msg = olefile.OleFileIO(file_stream)
msg = olefile.OleFileIO(local_path) # Extract email metadata
# Extract email metadata md_content = "# Email Message\n\n"
md_content = "# Email Message\n\n"
# Get headers # Get headers
headers = { headers = {
"From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
"To": self._get_stream_data(msg, "__substg1.0_0E04001F"), "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
"Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
} }
# Add headers to markdown # Add headers to markdown
for key, value in headers.items(): for key, value in headers.items():
if value: if value:
md_content += f"**{key}:** {value}\n" md_content += f"**{key}:** {value}\n"
md_content += "\n## Content\n\n" md_content += "\n## Content\n\n"
# Get email body # Get email body
body = self._get_stream_data(msg, "__substg1.0_1000001F") body = self._get_stream_data(msg, "__substg1.0_1000001F")
if body: if body:
md_content += body md_content += body
msg.close() msg.close()
return DocumentConverterResult( return DocumentConverterResult(
markdown=md_content.strip(), markdown=md_content.strip(),
title=headers.get("Subject"), title=headers.get("Subject"),
) )
except Exception as e:
raise FileConversionException(
f"Could not convert MSG file '{local_path}': {str(e)}"
)
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]: def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
"""Helper to safely extract and decode stream data from the MSG file.""" """Helper to safely extract and decode stream data from the MSG file."""

View file

@@ -1,10 +1,27 @@
from xml.dom import minidom from xml.dom import minidom
from typing import Union from typing import BinaryIO, Any, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
from .._stream_info import StreamInfo
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
PRECISE_MIME_TYPE_PREFIXES = [
"application/rss",
"application/atom",
]
PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]
CANDIDATE_MIME_TYPE_PREFIXES = [
"text/xml",
"application/xml",
]
CANDIDATE_FILE_EXTENSIONS = [
".xml",
]
class RssConverter(DocumentConverter): class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown""" """Convert RSS / Atom type to markdown"""
@@ -14,115 +31,140 @@ class RssConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def accepts(
self, local_path: str, **kwargs self,
) -> Union[None, DocumentConverterResult]: file_stream: BinaryIO,
# Bail if not RSS type stream_info: StreamInfo,
extension = kwargs.get("file_extension", "") **kwargs: Any, # Options to pass to the converter
if extension.lower() not in [".xml", ".rss", ".atom"]: ) -> bool:
return None mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
# Check for precise mimetypes and file extensions
if extension in PRECISE_FILE_EXTENSIONS:
return True
for prefix in PRECISE_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
# Check for candidate mimetypes and file extensions
if extension in CANDIDATE_FILE_EXTENSIONS:
return self._check_xml(file_stream)
for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return self._check_xml(file_stream)
return False
def _check_xml(self, file_stream: BinaryIO) -> bool:
cur_pos = file_stream.tell()
try: try:
doc = minidom.parse(local_path) doc = minidom.parse(file_stream)
return self._feed_type(doc) is not None
except BaseException as _: except BaseException as _:
return None pass
result = None finally:
file_stream.seek(cur_pos)
return False
def _feed_type(self, doc: Any) -> str:
if doc.getElementsByTagName("rss"): if doc.getElementsByTagName("rss"):
# A RSS feed must have a root element of <rss> return "rss"
result = self._parse_rss_type(doc)
elif doc.getElementsByTagName("feed"): elif doc.getElementsByTagName("feed"):
root = doc.getElementsByTagName("feed")[0] root = doc.getElementsByTagName("feed")[0]
if root.getElementsByTagName("entry"): if root.getElementsByTagName("entry"):
# An Atom feed must have a root element of <feed> and at least one <entry> # An Atom feed must have a root element of <feed> and at least one <entry>
result = self._parse_atom_type(doc) return "atom"
else: return None
return None
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
doc = minidom.parse(file_stream)
feed_type = self._feed_type(doc)
if feed_type == "rss":
return self._parse_rss_type(doc)
elif feed_type == "atom":
return self._parse_atom_type(doc)
else: else:
# not rss or atom raise ValueError("Unknown feed type")
return None
return result def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
def _parse_atom_type(
self, doc: minidom.Document
) -> Union[None, DocumentConverterResult]:
"""Parse the type of an Atom feed. """Parse the type of an Atom feed.
Returns None if the feed type is not recognized or something goes wrong. Returns None if the feed type is not recognized or something goes wrong.
""" """
try: root = doc.getElementsByTagName("feed")[0]
root = doc.getElementsByTagName("feed")[0] title = self._get_data_by_tag_name(root, "title")
title = self._get_data_by_tag_name(root, "title") subtitle = self._get_data_by_tag_name(root, "subtitle")
subtitle = self._get_data_by_tag_name(root, "subtitle") entries = root.getElementsByTagName("entry")
entries = root.getElementsByTagName("entry") md_text = f"# {title}\n"
md_text = f"# {title}\n" if subtitle:
if subtitle: md_text += f"{subtitle}\n"
md_text += f"{subtitle}\n" for entry in entries:
for entry in entries: entry_title = self._get_data_by_tag_name(entry, "title")
entry_title = self._get_data_by_tag_name(entry, "title") entry_summary = self._get_data_by_tag_name(entry, "summary")
entry_summary = self._get_data_by_tag_name(entry, "summary") entry_updated = self._get_data_by_tag_name(entry, "updated")
entry_updated = self._get_data_by_tag_name(entry, "updated") entry_content = self._get_data_by_tag_name(entry, "content")
entry_content = self._get_data_by_tag_name(entry, "content")
if entry_title: if entry_title:
md_text += f"\n## {entry_title}\n" md_text += f"\n## {entry_title}\n"
if entry_updated: if entry_updated:
md_text += f"Updated on: {entry_updated}\n" md_text += f"Updated on: {entry_updated}\n"
if entry_summary: if entry_summary:
md_text += self._parse_content(entry_summary) md_text += self._parse_content(entry_summary)
if entry_content: if entry_content:
md_text += self._parse_content(entry_content) md_text += self._parse_content(entry_content)
return DocumentConverterResult( return DocumentConverterResult(
markdown=md_text, markdown=md_text,
title=title, title=title,
) )
except BaseException as _:
return None
def _parse_rss_type( def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
self, doc: minidom.Document
) -> Union[None, DocumentConverterResult]:
"""Parse the type of an RSS feed. """Parse the type of an RSS feed.
Returns None if the feed type is not recognized or something goes wrong. Returns None if the feed type is not recognized or something goes wrong.
""" """
try: root = doc.getElementsByTagName("rss")[0]
root = doc.getElementsByTagName("rss")[0] channel = root.getElementsByTagName("channel")
channel = root.getElementsByTagName("channel") if not channel:
if not channel:
return None
channel = channel[0]
channel_title = self._get_data_by_tag_name(channel, "title")
channel_description = self._get_data_by_tag_name(channel, "description")
items = channel.getElementsByTagName("item")
if channel_title:
md_text = f"# {channel_title}\n"
if channel_description:
md_text += f"{channel_description}\n"
if not items:
items = []
for item in items:
title = self._get_data_by_tag_name(item, "title")
description = self._get_data_by_tag_name(item, "description")
pubDate = self._get_data_by_tag_name(item, "pubDate")
content = self._get_data_by_tag_name(item, "content:encoded")
if title:
md_text += f"\n## {title}\n"
if pubDate:
md_text += f"Published on: {pubDate}\n"
if description:
md_text += self._parse_content(description)
if content:
md_text += self._parse_content(content)
return DocumentConverterResult(
markdown=md_text,
title=channel_title,
)
except BaseException as _:
print(traceback.format_exc())
return None return None
channel = channel[0]
channel_title = self._get_data_by_tag_name(channel, "title")
channel_description = self._get_data_by_tag_name(channel, "description")
items = channel.getElementsByTagName("item")
if channel_title:
md_text = f"# {channel_title}\n"
if channel_description:
md_text += f"{channel_description}\n"
if not items:
items = []
for item in items:
title = self._get_data_by_tag_name(item, "title")
description = self._get_data_by_tag_name(item, "description")
pubDate = self._get_data_by_tag_name(item, "pubDate")
content = self._get_data_by_tag_name(item, "content:encoded")
if title:
md_text += f"\n## {title}\n"
if pubDate:
md_text += f"Published on: {pubDate}\n"
if description:
md_text += self._parse_content(description)
if content:
md_text += self._parse_content(content)
return DocumentConverterResult(
markdown=md_text,
title=channel_title,
)
def _parse_content(self, content: str) -> str: def _parse_content(self, content: str) -> str:
"""Parse the content of an RSS feed item""" """Parse the content of an RSS feed item"""

View file

@@ -61,7 +61,8 @@ class WikipediaConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Parse the stream # Parse the stream
soup = BeautifulSoup(file_stream, "html.parser") encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks # Remove javascript and style blocks
for script in soup(["script", "style"]): for script in soup(["script", "style"]):