Added Outlook messages.
This commit is contained in:
parent
4d09a4c6c6
commit
7879028c98
6 changed files with 213 additions and 130 deletions
|
|
@ -101,5 +101,4 @@ class StreamInfo:
|
||||||
# and, in any case, doesn't add new information.
|
# and, in any case, doesn't add new information.
|
||||||
guesses.append(cls(**kwargs))
|
guesses.append(cls(**kwargs))
|
||||||
|
|
||||||
# Return the guesses
|
|
||||||
return guesses
|
return guesses
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,8 @@ class BingSerpConverter(DocumentConverter):
|
||||||
query = parsed_params.get("q", [""])[0]
|
query = parsed_params.get("q", [""])[0]
|
||||||
|
|
||||||
# Parse the stream
|
# Parse the stream
|
||||||
soup = BeautifulSoup(file_stream, "html.parser")
|
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
|
||||||
|
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||||
|
|
||||||
# Clean up some formatting
|
# Clean up some formatting
|
||||||
for tptt in soup.find_all(class_="tptt"):
|
for tptt in soup.find_all(class_="tptt"):
|
||||||
|
|
|
||||||
|
|
@ -50,7 +50,8 @@ class HtmlConverter(DocumentConverter):
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Parse the stream
|
# Parse the stream
|
||||||
soup = BeautifulSoup(file_stream, "html.parser")
|
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
|
||||||
|
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, Union
|
from typing import Any, Union, BinaryIO
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
|
@ -12,6 +13,12 @@ except ImportError:
|
||||||
# Preserve the error and stack trace for later
|
# Preserve the error and stack trace for later
|
||||||
_dependency_exc_info = sys.exc_info()
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/vnd.ms-outlook",
|
||||||
|
]
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [".msg"]
|
||||||
|
|
||||||
|
|
||||||
class OutlookMsgConverter(DocumentConverter):
|
class OutlookMsgConverter(DocumentConverter):
|
||||||
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
|
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
|
||||||
|
|
@ -26,14 +33,52 @@ class OutlookMsgConverter(DocumentConverter):
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def accepts(
|
||||||
self, local_path: str, **kwargs: Any
|
self,
|
||||||
) -> Union[None, DocumentConverterResult]:
|
file_stream: BinaryIO,
|
||||||
# Bail if not a MSG file
|
stream_info: StreamInfo,
|
||||||
extension = kwargs.get("file_extension", "")
|
**kwargs: Any, # Options to pass to the converter
|
||||||
if extension.lower() != ".msg":
|
) -> bool:
|
||||||
return None
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
# Check the extension and mimetype
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Brute force, check if we have an OLE file
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
|
try:
|
||||||
|
if not olefile.isOleFile(file_stream):
|
||||||
|
return False
|
||||||
|
finally:
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
|
||||||
|
# Brute force, check if it's an Outlook file
|
||||||
|
try:
|
||||||
|
msg = olefile.OleFileIO(file_stream)
|
||||||
|
toc = "\n".join([str(stream) for stream in msg.listdir()])
|
||||||
|
return (
|
||||||
|
"__properties_version1.0" in toc
|
||||||
|
and "__recip_version1.0_#00000000" in toc
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Check: the dependencies
|
# Check: the dependencies
|
||||||
if _dependency_exc_info is not None:
|
if _dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
|
|
@ -46,41 +91,35 @@ class OutlookMsgConverter(DocumentConverter):
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
) # Restore the original traceback
|
||||||
|
|
||||||
try:
|
msg = olefile.OleFileIO(file_stream)
|
||||||
msg = olefile.OleFileIO(local_path)
|
# Extract email metadata
|
||||||
# Extract email metadata
|
md_content = "# Email Message\n\n"
|
||||||
md_content = "# Email Message\n\n"
|
|
||||||
|
|
||||||
# Get headers
|
# Get headers
|
||||||
headers = {
|
headers = {
|
||||||
"From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
|
"From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
|
||||||
"To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
|
"To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
|
||||||
"Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
|
"Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add headers to markdown
|
# Add headers to markdown
|
||||||
for key, value in headers.items():
|
for key, value in headers.items():
|
||||||
if value:
|
if value:
|
||||||
md_content += f"**{key}:** {value}\n"
|
md_content += f"**{key}:** {value}\n"
|
||||||
|
|
||||||
md_content += "\n## Content\n\n"
|
md_content += "\n## Content\n\n"
|
||||||
|
|
||||||
# Get email body
|
# Get email body
|
||||||
body = self._get_stream_data(msg, "__substg1.0_1000001F")
|
body = self._get_stream_data(msg, "__substg1.0_1000001F")
|
||||||
if body:
|
if body:
|
||||||
md_content += body
|
md_content += body
|
||||||
|
|
||||||
msg.close()
|
msg.close()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=md_content.strip(),
|
markdown=md_content.strip(),
|
||||||
title=headers.get("Subject"),
|
title=headers.get("Subject"),
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
raise FileConversionException(
|
|
||||||
f"Could not convert MSG file '{local_path}': {str(e)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
|
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
|
||||||
"""Helper to safely extract and decode stream data from the MSG file."""
|
"""Helper to safely extract and decode stream data from the MSG file."""
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,27 @@
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from typing import Union
|
from typing import BinaryIO, Any, Union
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
|
PRECISE_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/rss",
|
||||||
|
"application/atom",
|
||||||
|
]
|
||||||
|
|
||||||
|
PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]
|
||||||
|
|
||||||
|
CANDIDATE_MIME_TYPE_PREFIXES = [
|
||||||
|
"text/xml",
|
||||||
|
"application/xml",
|
||||||
|
]
|
||||||
|
|
||||||
|
CANDIDATE_FILE_EXTENSIONS = [
|
||||||
|
".xml",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class RssConverter(DocumentConverter):
|
class RssConverter(DocumentConverter):
|
||||||
"""Convert RSS / Atom type to markdown"""
|
"""Convert RSS / Atom type to markdown"""
|
||||||
|
|
@ -14,115 +31,140 @@ class RssConverter(DocumentConverter):
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def accepts(
|
||||||
self, local_path: str, **kwargs
|
self,
|
||||||
) -> Union[None, DocumentConverterResult]:
|
file_stream: BinaryIO,
|
||||||
# Bail if not RSS type
|
stream_info: StreamInfo,
|
||||||
extension = kwargs.get("file_extension", "")
|
**kwargs: Any, # Options to pass to the converter
|
||||||
if extension.lower() not in [".xml", ".rss", ".atom"]:
|
) -> bool:
|
||||||
return None
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
# Check for precise mimetypes and file extensions
|
||||||
|
if extension in PRECISE_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in PRECISE_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check for candidate mimetypes and file extensions (confirmed by inspecting the XML)
|
||||||
|
if extension in CANDIDATE_FILE_EXTENSIONS:
|
||||||
|
return self._check_xml(file_stream)
|
||||||
|
|
||||||
|
for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return self._check_xml(file_stream)
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _check_xml(self, file_stream: BinaryIO) -> bool:
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
try:
|
try:
|
||||||
doc = minidom.parse(local_path)
|
doc = minidom.parse(file_stream)
|
||||||
|
return self._feed_type(doc) is not None
|
||||||
except BaseException as _:
|
except BaseException as _:
|
||||||
return None
|
pass
|
||||||
result = None
|
finally:
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _feed_type(self, doc: Any) -> str:
|
||||||
if doc.getElementsByTagName("rss"):
|
if doc.getElementsByTagName("rss"):
|
||||||
# A RSS feed must have a root element of <rss>
|
return "rss"
|
||||||
result = self._parse_rss_type(doc)
|
|
||||||
elif doc.getElementsByTagName("feed"):
|
elif doc.getElementsByTagName("feed"):
|
||||||
root = doc.getElementsByTagName("feed")[0]
|
root = doc.getElementsByTagName("feed")[0]
|
||||||
if root.getElementsByTagName("entry"):
|
if root.getElementsByTagName("entry"):
|
||||||
# An Atom feed must have a root element of <feed> and at least one <entry>
|
# An Atom feed must have a root element of <feed> and at least one <entry>
|
||||||
result = self._parse_atom_type(doc)
|
return "atom"
|
||||||
else:
|
return None
|
||||||
return None
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
doc = minidom.parse(file_stream)
|
||||||
|
feed_type = self._feed_type(doc)
|
||||||
|
|
||||||
|
if feed_type == "rss":
|
||||||
|
return self._parse_rss_type(doc)
|
||||||
|
elif feed_type == "atom":
|
||||||
|
return self._parse_atom_type(doc)
|
||||||
else:
|
else:
|
||||||
# not rss or atom
|
raise ValueError("Unknown feed type")
|
||||||
return None
|
|
||||||
|
|
||||||
return result
|
def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
|
||||||
|
|
||||||
def _parse_atom_type(
|
|
||||||
self, doc: minidom.Document
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
"""Parse the type of an Atom feed.
|
"""Parse the type of an Atom feed.
|
||||||
|
|
||||||
Returns None if the feed type is not recognized or something goes wrong.
|
Errors raised while reading the feed propagate to the caller.
|
||||||
"""
|
"""
|
||||||
try:
|
root = doc.getElementsByTagName("feed")[0]
|
||||||
root = doc.getElementsByTagName("feed")[0]
|
title = self._get_data_by_tag_name(root, "title")
|
||||||
title = self._get_data_by_tag_name(root, "title")
|
subtitle = self._get_data_by_tag_name(root, "subtitle")
|
||||||
subtitle = self._get_data_by_tag_name(root, "subtitle")
|
entries = root.getElementsByTagName("entry")
|
||||||
entries = root.getElementsByTagName("entry")
|
md_text = f"# {title}\n"
|
||||||
md_text = f"# {title}\n"
|
if subtitle:
|
||||||
if subtitle:
|
md_text += f"{subtitle}\n"
|
||||||
md_text += f"{subtitle}\n"
|
for entry in entries:
|
||||||
for entry in entries:
|
entry_title = self._get_data_by_tag_name(entry, "title")
|
||||||
entry_title = self._get_data_by_tag_name(entry, "title")
|
entry_summary = self._get_data_by_tag_name(entry, "summary")
|
||||||
entry_summary = self._get_data_by_tag_name(entry, "summary")
|
entry_updated = self._get_data_by_tag_name(entry, "updated")
|
||||||
entry_updated = self._get_data_by_tag_name(entry, "updated")
|
entry_content = self._get_data_by_tag_name(entry, "content")
|
||||||
entry_content = self._get_data_by_tag_name(entry, "content")
|
|
||||||
|
|
||||||
if entry_title:
|
if entry_title:
|
||||||
md_text += f"\n## {entry_title}\n"
|
md_text += f"\n## {entry_title}\n"
|
||||||
if entry_updated:
|
if entry_updated:
|
||||||
md_text += f"Updated on: {entry_updated}\n"
|
md_text += f"Updated on: {entry_updated}\n"
|
||||||
if entry_summary:
|
if entry_summary:
|
||||||
md_text += self._parse_content(entry_summary)
|
md_text += self._parse_content(entry_summary)
|
||||||
if entry_content:
|
if entry_content:
|
||||||
md_text += self._parse_content(entry_content)
|
md_text += self._parse_content(entry_content)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=md_text,
|
markdown=md_text,
|
||||||
title=title,
|
title=title,
|
||||||
)
|
)
|
||||||
except BaseException as _:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _parse_rss_type(
|
def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
|
||||||
self, doc: minidom.Document
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
"""Parse the type of an RSS feed.
|
"""Parse the type of an RSS feed.
|
||||||
|
|
||||||
Returns None if the feed type is not recognized or something goes wrong.
|
Returns None if the feed type is not recognized or something goes wrong.
|
||||||
"""
|
"""
|
||||||
try:
|
root = doc.getElementsByTagName("rss")[0]
|
||||||
root = doc.getElementsByTagName("rss")[0]
|
channel = root.getElementsByTagName("channel")
|
||||||
channel = root.getElementsByTagName("channel")
|
if not channel:
|
||||||
if not channel:
|
|
||||||
return None
|
|
||||||
channel = channel[0]
|
|
||||||
channel_title = self._get_data_by_tag_name(channel, "title")
|
|
||||||
channel_description = self._get_data_by_tag_name(channel, "description")
|
|
||||||
items = channel.getElementsByTagName("item")
|
|
||||||
if channel_title:
|
|
||||||
md_text = f"# {channel_title}\n"
|
|
||||||
if channel_description:
|
|
||||||
md_text += f"{channel_description}\n"
|
|
||||||
if not items:
|
|
||||||
items = []
|
|
||||||
for item in items:
|
|
||||||
title = self._get_data_by_tag_name(item, "title")
|
|
||||||
description = self._get_data_by_tag_name(item, "description")
|
|
||||||
pubDate = self._get_data_by_tag_name(item, "pubDate")
|
|
||||||
content = self._get_data_by_tag_name(item, "content:encoded")
|
|
||||||
|
|
||||||
if title:
|
|
||||||
md_text += f"\n## {title}\n"
|
|
||||||
if pubDate:
|
|
||||||
md_text += f"Published on: {pubDate}\n"
|
|
||||||
if description:
|
|
||||||
md_text += self._parse_content(description)
|
|
||||||
if content:
|
|
||||||
md_text += self._parse_content(content)
|
|
||||||
|
|
||||||
return DocumentConverterResult(
|
|
||||||
markdown=md_text,
|
|
||||||
title=channel_title,
|
|
||||||
)
|
|
||||||
except BaseException as _:
|
|
||||||
print(traceback.format_exc())
|
|
||||||
return None
|
return None
|
||||||
|
channel = channel[0]
|
||||||
|
channel_title = self._get_data_by_tag_name(channel, "title")
|
||||||
|
channel_description = self._get_data_by_tag_name(channel, "description")
|
||||||
|
items = channel.getElementsByTagName("item")
|
||||||
|
if channel_title:
|
||||||
|
md_text = f"# {channel_title}\n"
|
||||||
|
if channel_description:
|
||||||
|
md_text += f"{channel_description}\n"
|
||||||
|
if not items:
|
||||||
|
items = []
|
||||||
|
for item in items:
|
||||||
|
title = self._get_data_by_tag_name(item, "title")
|
||||||
|
description = self._get_data_by_tag_name(item, "description")
|
||||||
|
pubDate = self._get_data_by_tag_name(item, "pubDate")
|
||||||
|
content = self._get_data_by_tag_name(item, "content:encoded")
|
||||||
|
|
||||||
|
if title:
|
||||||
|
md_text += f"\n## {title}\n"
|
||||||
|
if pubDate:
|
||||||
|
md_text += f"Published on: {pubDate}\n"
|
||||||
|
if description:
|
||||||
|
md_text += self._parse_content(description)
|
||||||
|
if content:
|
||||||
|
md_text += self._parse_content(content)
|
||||||
|
|
||||||
|
return DocumentConverterResult(
|
||||||
|
markdown=md_text,
|
||||||
|
title=channel_title,
|
||||||
|
)
|
||||||
|
|
||||||
def _parse_content(self, content: str) -> str:
|
def _parse_content(self, content: str) -> str:
|
||||||
"""Parse the content of an RSS feed item"""
|
"""Parse the content of an RSS feed item"""
|
||||||
|
|
|
||||||
|
|
@ -61,7 +61,8 @@ class WikipediaConverter(DocumentConverter):
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Parse the stream
|
# Parse the stream
|
||||||
soup = BeautifulSoup(file_stream, "html.parser")
|
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
|
||||||
|
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue