Added Outlook messages.

2025-03-04 16:15:07 -08:00 · 2025-03-04 16:15:07 -08:00 · 7879028c98
commit 7879028c98
parent 4d09a4c6c6
6 changed files with 213 additions and 130 deletions
--- a/packages/markitdown/src/markitdown/_stream_info.py
+++ b/packages/markitdown/src/markitdown/_stream_info.py
@ -101,5 +101,4 @@ class StreamInfo:
                # and, in any case, doesn't add new information.
                guesses.append(cls(**kwargs))

-        # Return the guesses
        return guesses
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -70,7 +70,8 @@ class BingSerpConverter(DocumentConverter):
        query = parsed_params.get("q", [""])[0]

        # Parse the stream
-        soup = BeautifulSoup(file_stream, "html.parser")
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -50,7 +50,8 @@ class HtmlConverter(DocumentConverter):
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the stream
-        soup = BeautifulSoup(file_stream, "html.parser")
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@ -1,5 +1,6 @@
 import sys
-from typing import Any, Union
+from typing import Any, Union, BinaryIO
+from .._stream_info import StreamInfo
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

@ -12,6 +13,12 @@ except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/vnd.ms-outlook",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".msg"]
+

 class OutlookMsgConverter(DocumentConverter):
    """Converts Outlook .msg files to markdown by extracting email metadata and content.
@ -26,14 +33,52 @@ class OutlookMsgConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)

-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not a MSG file
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".msg":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()

+        # Check the extension and mimetype
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Brute force, check if we have an OLE file
+        cur_pos = file_stream.tell()
+        try:
+            if not olefile.isOleFile(file_stream):
+                return False
+        finally:
+            file_stream.seek(cur_pos)
+
+        # Brute force, check if it's an Outlook file
+        try:
+            msg = olefile.OleFileIO(file_stream)
+            toc = "\n".join([str(stream) for stream in msg.listdir()])
+            return (
+                "__properties_version1.0" in toc
+                and "__recip_version1.0_#00000000" in toc
+            )
+        except Exception as e:
+            pass
+        finally:
+            file_stream.seek(cur_pos)
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@ -46,8 +91,7 @@ class OutlookMsgConverter(DocumentConverter):
                _dependency_exc_info[2]
            )  # Restore the original traceback

-        try:
-            msg = olefile.OleFileIO(local_path)
+        msg = olefile.OleFileIO(file_stream)
        # Extract email metadata
        md_content = "# Email Message\n\n"

@ -77,11 +121,6 @@ class OutlookMsgConverter(DocumentConverter):
            title=headers.get("Subject"),
        )

-        except Exception as e:
-            raise FileConversionException(
-                f"Could not convert MSG file '{local_path}': {str(e)}"
-            )
-
    def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
        """Helper to safely extract and decode stream data from the MSG file."""
        assert isinstance(
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@ -1,10 +1,27 @@
 from xml.dom import minidom
-from typing import Union
+from typing import BinaryIO, Any, Union
 from bs4 import BeautifulSoup

 from ._markdownify import _CustomMarkdownify
+from .._stream_info import StreamInfo
 from .._base_converter import DocumentConverter, DocumentConverterResult

+PRECISE_MIME_TYPE_PREFIXES = [
+    "application/rss",
+    "application/atom",
+]
+
+PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]
+
+CANDIDATE_MIME_TYPE_PREFIXES = [
+    "text/xml",
+    "application/xml",
+]
+
+CANDIDATE_FILE_EXTENSIONS = [
+    ".xml",
+]
+

 class RssConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""
@ -14,42 +31,75 @@ class RssConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)

-    def convert(
-        self, local_path: str, **kwargs
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not RSS type
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".xml", ".rss", ".atom"]:
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        # Check for precise mimetypes and file extensions
+        if extension in PRECISE_FILE_EXTENSIONS:
+            return True
+
+        for prefix in PRECISE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Check for candidate mimetypes and file extensions (confirmed by inspecting the XML)
+        if extension in CANDIDATE_FILE_EXTENSIONS:
+            return self._check_xml(file_stream)
+
+        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return self._check_xml(file_stream)
+
+        return False
+
+    def _check_xml(self, file_stream: BinaryIO) -> bool:
+        cur_pos = file_stream.tell()
        try:
-            doc = minidom.parse(local_path)
+            doc = minidom.parse(file_stream)
+            return self._feed_type(doc) is not None
        except BaseException as _:
-            return None
-        result = None
+            pass
+        finally:
+            file_stream.seek(cur_pos)
+        return False
+
+    def _feed_type(self, doc: Any) -> str:
        if doc.getElementsByTagName("rss"):
-            # A RSS feed must have a root element of <rss>
-            result = self._parse_rss_type(doc)
+            return "rss"
        elif doc.getElementsByTagName("feed"):
            root = doc.getElementsByTagName("feed")[0]
            if root.getElementsByTagName("entry"):
                # An Atom feed must have a root element of <feed> and at least one <entry>
-                result = self._parse_atom_type(doc)
-            else:
-                return None
-        else:
-            # not rss or atom
+                return "atom"
        return None

-        return result
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        doc = minidom.parse(file_stream)
+        feed_type = self._feed_type(doc)

-    def _parse_atom_type(
-        self, doc: minidom.Document
-    ) -> Union[None, DocumentConverterResult]:
+        if feed_type == "rss":
+            return self._parse_rss_type(doc)
+        elif feed_type == "atom":
+            return self._parse_atom_type(doc)
+        else:
+            raise ValueError("Unknown feed type")
+
+    def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
        """Parse the type of an Atom feed.

        Returns None if the feed type is not recognized or something goes wrong.
        """
-        try:
        root = doc.getElementsByTagName("feed")[0]
        title = self._get_data_by_tag_name(root, "title")
        subtitle = self._get_data_by_tag_name(root, "subtitle")
@ -76,17 +126,12 @@ class RssConverter(DocumentConverter):
            markdown=md_text,
            title=title,
        )
-        except BaseException as _:
-            return None

-    def _parse_rss_type(
-        self, doc: minidom.Document
-    ) -> Union[None, DocumentConverterResult]:
+    def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
        """Parse the type of an RSS feed.

        Returns None if the feed type is not recognized or something goes wrong.
        """
-        try:
        root = doc.getElementsByTagName("rss")[0]
        channel = root.getElementsByTagName("channel")
        if not channel:
@ -120,9 +165,6 @@ class RssConverter(DocumentConverter):
            markdown=md_text,
            title=channel_title,
        )
-        except BaseException as _:
-            print(traceback.format_exc())
-            return None

    def _parse_content(self, content: str) -> str:
        """Parse the content of an RSS feed item"""
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -61,7 +61,8 @@ class WikipediaConverter(DocumentConverter):
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the stream
-        soup = BeautifulSoup(file_stream, "html.parser")
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):