Updating converters.

2025-03-04 13:57:49 -08:00 · 2025-03-04 13:57:49 -08:00 · 4d09a4c6c6
commit 4d09a4c6c6
parent df372fa460
8 changed files with 366 additions and 207 deletions
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -80,23 +80,46 @@ class DocumentConverter:
        """
        self._priority = priority

-    def convert_stream(
+    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
-    ) -> Union[None, DocumentConverterResult]:
+    ) -> bool:
        """
-        Convert a document to Markdown text, or return None if the converter
-        cannot handle the document (causing the next converter to be tried).
+        Return a quick determination of whether the converter should attempt converting the document.
+        This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
+        In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
+        make a determination (e.g., special converters for Wikipedia, YouTube, etc.).
+        Finally, it is conceivable that the `stream_info.filename` might be used in cases
+        where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc.).

-        The determination of whether a converter can handle a document is primarily based on
-        the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as
-        a secondary check if the MIME type is not sufficiently specific
-        (e.g., application/octet-stream). In the case of data retreived via HTTP, the
-        `steam_info.url` might also be referenced to guide conversion (e.g., special-handling
-        for Wikipedia). Finally, the `stream_info.chatset` is used to determine the encoding
-        of the file content in cases of text/*
+        NOTE: The method signature is designed to match that of the convert() method. This provides some
+        assurance that, if accepts() returns True, the convert() method will also be able to handle the document.
+
+        IMPORTANT: If this method advances the position in file_stream, it must also reset the position before
+        returning. This is because the convert() method may be called immediately after accepts().
+
+        Parameters:
+        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
+        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
+        - kwargs: Additional keyword arguments for the converter.
+
+        Returns:
+        - bool: True if the converter can handle the document, False otherwise.
+        """
+        raise NotImplementedError(
+            f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
+        )
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Convert a document to Markdown text.

        Prameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
@ -105,68 +128,11 @@ class DocumentConverter:

        Returns:
        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
-        or
-        - None: If the converter cannot handle the document.

        Raises:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
-
-        # Default implementation ensures backward compatibility with the legacy convert() method, and
-        # should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
-        # in the future.
-        result = None
-        used_legacy = False
-
-        if stream_info.local_path is not None and os.path.exists(
-            stream_info.local_path
-        ):
-            # If the stream is backed by a local file, pass it to the legacy convert() method
-            try:
-                result = self.convert(stream_info.local_path, **kwargs)
-                used_legacy = True
-            except (
-                NotImplementedError
-            ):  # If it wasn't implemented, rethrow the error, but with this as the stack trace
-                raise NotImplementedError(
-                    "Subclasses must implement the convert_stream method."
-                )
-        else:
-            # Otherwise, we need to read the stream into a temporary file. There is potential for
-            # thrashing here if there are many converters or conversion attempts
-            cur_pos = file_stream.tell()
-            temp_fd, temp_path = tempfile.mkstemp()
-            try:
-                with os.fdopen(temp_fd, "wb") as temp_file:
-                    temp_file.write(file_stream.read())
-                try:
-                    result = self.convert(temp_path, **kwargs)
-                    used_legacy = True
-                except NotImplementedError:
-                    raise NotImplementedError(
-                        "Subclasses must implement the convert_stream method."
-                    )
-            finally:
-                os.remove(temp_path)
-                file_stream.seek(0)
-
-        if used_legacy:
-            message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
-            if message not in _WARNED:
-                warn(message, DeprecationWarning)
-                _WARNED.append(message)
-
-        return result
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        """
-        Legacy, and deprecated method to convert a document to Markdown text.
-        This method reads from the file at `local_path` and returns the converted Markdown text.
-        This method is deprecated in favor of `convert_stream`, which uses a file-like object.
-        """
        raise NotImplementedError("Subclasses must implement this method")

    @property
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -414,8 +414,16 @@ class MarkItDown:
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
        sorted_converters = sorted(self._converters, key=lambda x: x.priority)

+        # Remember the initial stream position so that we can return to it
+        cur_pos = file_stream.tell()
+
        for stream_info in stream_info_guesses + [StreamInfo()]:
            for converter in sorted_converters:
+                # Sanity check -- make sure the cur_pos is still the same
+                assert (
+                    cur_pos == file_stream.tell()
+                ), f"File stream position should NOT change between guess iterations"
+
                _kwargs = copy.deepcopy(kwargs)

                # Copy any additional global options
@ -442,17 +450,29 @@ class MarkItDown:
                    if stream_info.url is not None:
                        _kwargs["url"] = stream_info.url

-                # Attempt the conversion
-                cur_pos = file_stream.tell()
+                # Check if the converter will accept the file, and if so, try to convert it
+                _accepts = False
                try:
-                    res = converter.convert_stream(file_stream, stream_info, **_kwargs)
-                except Exception:
-                    failed_attempts.append(
-                        FailedConversionAttempt(
-                            converter=converter, exc_info=sys.exc_info()
-                        )
-                    )
-                finally:
+                    _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
+                except NotImplementedError:
+                    pass
+
+                # accepts() should not have changed the file stream position
+                assert (
+                    cur_pos == file_stream.tell()
+                ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
+
+                # Attempt the conversion
+                if _accepts:
+                    # try:
+                    res = converter.convert(file_stream, stream_info, **_kwargs)
+                    # except Exception:
+                    #    failed_attempts.append(
+                    #        FailedConversionAttempt(
+                    #            converter=converter, exc_info=sys.exc_info()
+                    #        )
+                    #    )
+                    # finally:
                    file_stream.seek(cur_pos)

                if res is not None:
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -1,14 +1,24 @@
-# type: ignore
-import base64
+import io
 import re
-
-from typing import Union
+import base64
 from urllib.parse import parse_qs, urlparse
+from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify

+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/html",
+    "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
+

 class BingSerpConverter(DocumentConverter):
    """
@ -21,23 +31,46 @@ class BingSerpConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a Bing SERP
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
-        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Make sure we're dealing with HTML content *from* Bing.
+        """

+        url = (stream_info.url or "").lower()
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
+            # Not a Bing SERP URL
+            return False
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Not HTML content
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Parse the query parameters
-        parsed_params = parse_qs(urlparse(url).query)
+        parsed_params = parse_qs(urlparse(stream_info.url).query)
        query = parsed_params.get("q", [""])[0]

-        # Parse the file
-        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        # Parse the stream
+        soup = BeautifulSoup(file_stream, "html.parser")

        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -1,9 +1,10 @@
 import sys

-from typing import Union
+from typing import BinaryIO, Any

-from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

 # Try loading optional (but in this case, required) dependencies
@ -16,6 +17,13 @@ except ImportError:
    _dependency_exc_info = sys.exc_info()


+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".docx"]
+
+
 class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
@ -25,13 +33,32 @@ class DocxConverter(HtmlConverter):
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
+        self._html_converter = HtmlConverter()

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a DOCX
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".docx":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()

+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@ -44,12 +71,7 @@ class DocxConverter(HtmlConverter):
                _dependency_exc_info[2]
            )  # Restore the original traceback

-        result = None
-        with open(local_path, "rb") as docx_file:
-            style_map = kwargs.get("style_map", None)
-
-            result = mammoth.convert_to_html(docx_file, style_map=style_map)
-            html_content = result.value
-            result = self._convert(html_content)
-
-        return result
+        style_map = kwargs.get("style_map", None)
+        return self._html_converter.convert_string(
+            mammoth.convert_to_html(file_stream, style_map=style_map).value
+        )
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -1,4 +1,5 @@
-from typing import Any, Union, BinaryIO
+import io
+from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
@ -24,39 +25,12 @@ class HtmlConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)

-    def convert_stream(
+    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not html
-        if not self._is_html(stream_info):
-            return None
-
-        # Read the stream into a string
-        html_content = str(
-            file_stream.read(),
-            encoding=stream_info.charset if stream_info.charset else "utf-8",
-        )
-        return self._convert(html_content)
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not html
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in ACCEPTED_FILE_EXTENSIONS:
-            return None
-
-        result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            result = self._convert(fh.read())
-
-        return result
-
-    def _is_html(self, stream_info: StreamInfo) -> bool:
-        """Helper function that checks if the stream is html."""
+    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

@ -69,11 +43,14 @@ class HtmlConverter(DocumentConverter):

        return False

-    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
-        """Helper function that converts an HTML string."""
-
-        # Parse the string
-        soup = BeautifulSoup(html_content, "html.parser")
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        # Parse the stream
+        soup = BeautifulSoup(file_stream, "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
@ -96,3 +73,22 @@ class HtmlConverter(DocumentConverter):
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
        )
+
+    def convert_string(
+        self, html_content: str, *, url: Optional[str] = None, **kwargs
+    ) -> DocumentConverterResult:
+        """
+        Non-standard convenience method to convert a string to markdown.
+        Given that many converters produce HTML as intermediate output, this
+        allows for easy conversion of HTML to markdown.
+        """
+        return self.convert(
+            file_stream=io.BytesIO(html_content.encode("utf-8")),
+            stream_info=StreamInfo(
+                mimetype="text/html",
+                extension=".html",
+                charset="utf-8",
+                url=url,
+            ),
+            **kwargs,
+        )
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -1,12 +1,13 @@
+import sys
 import base64
 import re
 import html
-import sys

-from typing import Union
+from typing import BinaryIO, Any

-from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

 # Try loading optional (but in this case, required) dependencies
@ -19,7 +20,14 @@ except ImportError:
    _dependency_exc_info = sys.exc_info()


-class PptxConverter(HtmlConverter):
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/vnd.openxmlformats-officedocument.presentationml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".pptx"]
+
+
+class PptxConverter(DocumentConverter):
    """
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
    """
@ -28,6 +36,7 @@ class PptxConverter(HtmlConverter):
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
+        self._html_converter = HtmlConverter()

    def _get_llm_description(
        self, llm_client, llm_model, image_blob, content_type, prompt=None
@ -58,12 +67,30 @@ class PptxConverter(HtmlConverter):
        )
        return response.choices[0].message.content

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a PPTX
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".pptx":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()

+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@ -76,7 +103,8 @@ class PptxConverter(HtmlConverter):
                _dependency_exc_info[2]
            )  # Restore the original traceback

-        presentation = pptx.Presentation(local_path)
+        # Perform the conversion
+        presentation = pptx.Presentation(file_stream)
        md_content = ""
        slide_num = 0
        for slide in presentation.slides:
@ -130,21 +158,7 @@ class PptxConverter(HtmlConverter):

                # Tables
                if self._is_table(shape):
-                    html_table = "<html><body><table>"
-                    first_row = True
-                    for row in shape.table.rows:
-                        html_table += "<tr>"
-                        for cell in row.cells:
-                            if first_row:
-                                html_table += "<th>" + html.escape(cell.text) + "</th>"
-                            else:
-                                html_table += "<td>" + html.escape(cell.text) + "</td>"
-                        html_table += "</tr>"
-                        first_row = False
-                    html_table += "</table></body></html>"
-                    md_content += (
-                        "\n" + self._convert(html_table).text_content.strip() + "\n"
-                    )
+                    md_content += self._convert_table_to_markdown(shape.table)

                # Charts
                if shape.has_chart:
@ -189,6 +203,23 @@ class PptxConverter(HtmlConverter):
            return True
        return False

+    def _convert_table_to_markdown(self, table):
+        # Write the table as HTML, then convert it to Markdown
+        html_table = "<html><body><table>"
+        first_row = True
+        for row in table.rows:
+            html_table += "<tr>"
+            for cell in row.cells:
+                if first_row:
+                    html_table += "<th>" + html.escape(cell.text) + "</th>"
+                else:
+                    html_table += "<td>" + html.escape(cell.text) + "</td>"
+            html_table += "</tr>"
+            first_row = False
+        html_table += "</table></body></html>"
+
+        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+
    def _convert_chart_to_markdown(self, chart):
        md = "\n\n### Chart"
        if chart.has_title:
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -1,11 +1,22 @@
+import io
 import re
-
-from typing import Any, Union
+from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify

+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/html",
+    "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
+

 class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""
@ -15,21 +26,42 @@ class WikipediaConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)

-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not Wikipedia
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
-        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Make sure we're dealing with HTML content *from* Wikipedia.
+        """

-        # Parse the file
-        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        url = (stream_info.url or "").lower()
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
+            # Not a Wikipedia URL
+            return False
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Not HTML content
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        # Parse the stream
+        soup = BeautifulSoup(file_stream, "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -1,10 +1,9 @@
 import sys
-
-from typing import Union
-
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from .._stream_info import StreamInfo

 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
@ -22,8 +21,19 @@ try:
 except ImportError:
    _xls_dependency_exc_info = sys.exc_info()

+ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+]
+ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]

-class XlsxConverter(HtmlConverter):
+ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
+    "application/vnd.ms-excel",
+    "application/excel",
+]
+ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
+
+
+class XlsxConverter(DocumentConverter):
    """
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """
@ -32,13 +42,32 @@ class XlsxConverter(HtmlConverter):
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
+        self._html_converter = HtmlConverter()

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a XLSX
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".xlsx":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()

+        if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Check the dependencies
        if _xlsx_dependency_exc_info is not None:
            raise MissingDependencyException(
@ -51,27 +80,54 @@ class XlsxConverter(HtmlConverter):
                _xlsx_dependency_exc_info[2]
            )  # Restore the original traceback

-        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
-            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+            md_content += (
+                self._html_converter.convert_string(html_content).markdown.strip()
+                + "\n\n"
+            )

        return DocumentConverterResult(markdown=md_content.strip())


-class XlsConverter(HtmlConverter):
+class XlsConverter(DocumentConverter):
    """
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """

-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a XLS
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".xls":
-            return None
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+        self._html_converter = HtmlConverter()

+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Load the dependencies
        if _xls_dependency_exc_info is not None:
            raise MissingDependencyException(
@ -84,11 +140,14 @@ class XlsConverter(HtmlConverter):
                _xls_dependency_exc_info[2]
            )  # Restore the original traceback

-        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
+        sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
-            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+            md_content += (
+                self._html_converter.convert_string(html_content).markdown.strip()
+                + "\n\n"
+            )

        return DocumentConverterResult(markdown=md_content.strip())