diff --git a/packages/markitdown/src/markitdown/_stream_info.py b/packages/markitdown/src/markitdown/_stream_info.py index 9014e73..7a30dcc 100644 --- a/packages/markitdown/src/markitdown/_stream_info.py +++ b/packages/markitdown/src/markitdown/_stream_info.py @@ -101,5 +101,4 @@ class StreamInfo: # and, in any case, doesn't add new information. guesses.append(cls(**kwargs)) - # Return the guesses return guesses diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 68860cf..2adcdb9 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -70,7 +70,8 @@ class BingSerpConverter(DocumentConverter): query = parsed_params.get("q", [""])[0] # Parse the stream - soup = BeautifulSoup(file_stream, "html.parser") + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Clean up some formatting for tptt in soup.find_all(class_="tptt"): diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 51eeab7..7d0c916 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -50,7 +50,8 @@ class HtmlConverter(DocumentConverter): **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Parse the stream - soup = BeautifulSoup(file_stream, "html.parser") + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Remove javascript and style blocks for script in soup(["script", "style"]): diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py 
b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index 84d8c47..3da5fbd 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -1,5 +1,6 @@ import sys -from typing import Any, Union +from typing import Any, Union, BinaryIO +from .._stream_info import StreamInfo from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -12,6 +13,12 @@ except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/vnd.ms-outlook", +] + +ACCEPTED_FILE_EXTENSIONS = [".msg"] + class OutlookMsgConverter(DocumentConverter): """Converts Outlook .msg files to markdown by extracting email metadata and content. @@ -26,14 +33,52 @@ class OutlookMsgConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not a MSG file - extension = kwargs.get("file_extension", "") - if extension.lower() != ".msg": - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + # Check the extension and mimetype + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # Brute force, check if we have an OLE file + cur_pos = file_stream.tell() + try: + if not olefile.isOleFile(file_stream): + return False + finally: + file_stream.seek(cur_pos) + + # Brute force, check if it's an Outlook file + try: + msg = olefile.OleFileIO(file_stream) + toc = "\n".join([str(stream) for stream in
msg.listdir()]) + return ( + "__properties_version1.0" in toc + and "__recip_version1.0_#00000000" in toc + ) + except Exception as e: + pass + finally: + file_stream.seek(cur_pos) + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check: the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -46,41 +91,35 @@ class OutlookMsgConverter(DocumentConverter): _dependency_exc_info[2] ) # Restore the original traceback - try: - msg = olefile.OleFileIO(local_path) - # Extract email metadata - md_content = "# Email Message\n\n" + msg = olefile.OleFileIO(file_stream) + # Extract email metadata + md_content = "# Email Message\n\n" - # Get headers - headers = { - "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), - "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), - "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), - } + # Get headers + headers = { + "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), + "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), + "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), + } - # Add headers to markdown - for key, value in headers.items(): - if value: - md_content += f"**{key}:** {value}\n" + # Add headers to markdown + for key, value in headers.items(): + if value: + md_content += f"**{key}:** {value}\n" - md_content += "\n## Content\n\n" + md_content += "\n## Content\n\n" - # Get email body - body = self._get_stream_data(msg, "__substg1.0_1000001F") - if body: - md_content += body + # Get email body + body = self._get_stream_data(msg, "__substg1.0_1000001F") + if body: + md_content += body - msg.close() + msg.close() - return DocumentConverterResult( - markdown=md_content.strip(), - title=headers.get("Subject"), - ) - - except Exception as e: - raise FileConversionException( - f"Could not convert MSG file '{local_path}': 
{str(e)}" - ) + return DocumentConverterResult( + markdown=md_content.strip(), + title=headers.get("Subject"), + ) def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]: """Helper to safely extract and decode stream data from the MSG file.""" diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 021d09d..3074c6c 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -1,10 +1,27 @@ from xml.dom import minidom -from typing import Union +from typing import BinaryIO, Any, Union from bs4 import BeautifulSoup from ._markdownify import _CustomMarkdownify +from .._stream_info import StreamInfo from .._base_converter import DocumentConverter, DocumentConverterResult +PRECISE_MIME_TYPE_PREFIXES = [ + "application/rss", + "application/atom", +] + +PRECISE_FILE_EXTENSIONS = [".rss", ".atom"] + +CANDIDATE_MIME_TYPE_PREFIXES = [ + "text/xml", + "application/xml", +] + +CANDIDATE_FILE_EXTENSIONS = [ + ".xml", +] + class RssConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" @@ -14,115 +31,140 @@ class RssConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert( - self, local_path: str, **kwargs - ) -> Union[None, DocumentConverterResult]: - # Bail if not RSS type - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".xml", ".rss", ".atom"]: - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + # Check for precise mimetypes and file extensions + if extension in PRECISE_FILE_EXTENSIONS: + return True + + for prefix in PRECISE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # 
Check for candidate mimetypes and file extensions + if extension in CANDIDATE_FILE_EXTENSIONS: + return self._check_xml(file_stream) + + for prefix in CANDIDATE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return self._check_xml(file_stream) + + return False + + def _check_xml(self, file_stream: BinaryIO) -> bool: + cur_pos = file_stream.tell() try: - doc = minidom.parse(local_path) + doc = minidom.parse(file_stream) + return self._feed_type(doc) is not None except BaseException as _: - return None - result = None + pass + finally: + file_stream.seek(cur_pos) + return False + + def _feed_type(self, doc: Any) -> str: if doc.getElementsByTagName("rss"): - # A RSS feed must have a root element of <rss> - result = self._parse_rss_type(doc) + return "rss" elif doc.getElementsByTagName("feed"): root = doc.getElementsByTagName("feed")[0] if root.getElementsByTagName("entry"): # An Atom feed must have a root element of <feed> and at least one <entry> - result = self._parse_atom_type(doc) - else: - return None + return "atom" + return None + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + doc = minidom.parse(file_stream) + feed_type = self._feed_type(doc) + + if feed_type == "rss": + return self._parse_rss_type(doc) + elif feed_type == "atom": + return self._parse_atom_type(doc) else: - # not rss or atom - return None + raise ValueError("Unknown feed type") - return result - - def _parse_atom_type( - self, doc: minidom.Document - ) -> Union[None, DocumentConverterResult]: + def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult: """Parse the type of an Atom feed. Returns None if the feed type is not recognized or something goes wrong.
""" - try: - root = doc.getElementsByTagName("feed")[0] - title = self._get_data_by_tag_name(root, "title") - subtitle = self._get_data_by_tag_name(root, "subtitle") - entries = root.getElementsByTagName("entry") - md_text = f"# {title}\n" - if subtitle: - md_text += f"{subtitle}\n" - for entry in entries: - entry_title = self._get_data_by_tag_name(entry, "title") - entry_summary = self._get_data_by_tag_name(entry, "summary") - entry_updated = self._get_data_by_tag_name(entry, "updated") - entry_content = self._get_data_by_tag_name(entry, "content") + root = doc.getElementsByTagName("feed")[0] + title = self._get_data_by_tag_name(root, "title") + subtitle = self._get_data_by_tag_name(root, "subtitle") + entries = root.getElementsByTagName("entry") + md_text = f"# {title}\n" + if subtitle: + md_text += f"{subtitle}\n" + for entry in entries: + entry_title = self._get_data_by_tag_name(entry, "title") + entry_summary = self._get_data_by_tag_name(entry, "summary") + entry_updated = self._get_data_by_tag_name(entry, "updated") + entry_content = self._get_data_by_tag_name(entry, "content") - if entry_title: - md_text += f"\n## {entry_title}\n" - if entry_updated: - md_text += f"Updated on: {entry_updated}\n" - if entry_summary: - md_text += self._parse_content(entry_summary) - if entry_content: - md_text += self._parse_content(entry_content) + if entry_title: + md_text += f"\n## {entry_title}\n" + if entry_updated: + md_text += f"Updated on: {entry_updated}\n" + if entry_summary: + md_text += self._parse_content(entry_summary) + if entry_content: + md_text += self._parse_content(entry_content) - return DocumentConverterResult( - markdown=md_text, - title=title, - ) - except BaseException as _: - return None + return DocumentConverterResult( + markdown=md_text, + title=title, + ) - def _parse_rss_type( - self, doc: minidom.Document - ) -> Union[None, DocumentConverterResult]: + def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult: """Parse the type 
of an RSS feed. Returns None if the feed type is not recognized or something goes wrong. """ - try: - root = doc.getElementsByTagName("rss")[0] - channel = root.getElementsByTagName("channel") - if not channel: - return None - channel = channel[0] - channel_title = self._get_data_by_tag_name(channel, "title") - channel_description = self._get_data_by_tag_name(channel, "description") - items = channel.getElementsByTagName("item") - if channel_title: - md_text = f"# {channel_title}\n" - if channel_description: - md_text += f"{channel_description}\n" - if not items: - items = [] - for item in items: - title = self._get_data_by_tag_name(item, "title") - description = self._get_data_by_tag_name(item, "description") - pubDate = self._get_data_by_tag_name(item, "pubDate") - content = self._get_data_by_tag_name(item, "content:encoded") - - if title: - md_text += f"\n## {title}\n" - if pubDate: - md_text += f"Published on: {pubDate}\n" - if description: - md_text += self._parse_content(description) - if content: - md_text += self._parse_content(content) - - return DocumentConverterResult( - markdown=md_text, - title=channel_title, - ) - except BaseException as _: - print(traceback.format_exc()) + root = doc.getElementsByTagName("rss")[0] + channel = root.getElementsByTagName("channel") + if not channel: return None + channel = channel[0] + channel_title = self._get_data_by_tag_name(channel, "title") + channel_description = self._get_data_by_tag_name(channel, "description") + items = channel.getElementsByTagName("item") + if channel_title: + md_text = f"# {channel_title}\n" + if channel_description: + md_text += f"{channel_description}\n" + if not items: + items = [] + for item in items: + title = self._get_data_by_tag_name(item, "title") + description = self._get_data_by_tag_name(item, "description") + pubDate = self._get_data_by_tag_name(item, "pubDate") + content = self._get_data_by_tag_name(item, "content:encoded") + + if title: + md_text += f"\n## {title}\n" + if 
pubDate: + md_text += f"Published on: {pubDate}\n" + if description: + md_text += self._parse_content(description) + if content: + md_text += self._parse_content(content) + + return DocumentConverterResult( + markdown=md_text, + title=channel_title, + ) def _parse_content(self, content: str) -> str: """Parse the content of an RSS feed item""" diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 86e1587..0eedaec 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -61,7 +61,8 @@ class WikipediaConverter(DocumentConverter): **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Parse the stream - soup = BeautifulSoup(file_stream, "html.parser") + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Remove javascript and style blocks for script in soup(["script", "style"]):