Updating converters.

2025-03-04 13:57:49 -08:00 · 2025-03-04 13:57:49 -08:00 · 4d09a4c6c6
commit 4d09a4c6c6
parent df372fa460
8 changed files with 366 additions and 207 deletions
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -80,23 +80,46 @@ class DocumentConverter:
        """
        self._priority = priority
-    def convert_stream(
+    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
-    ) -> Union[None, DocumentConverterResult]:
+    ) -> bool:
        """
-        Convert a document to Markdown text, or return None if the converter
+        Return a quick determination on if the converter should attempt converting the document.
-        cannot handle the document (causing the next converter to be tried).
+        This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
        In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
        make a determination (e.g., special converters for Wikipedia, YouTube etc).
        Finally, it is conceivable that the `stream_info.filename` might be used in cases
        where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc)
-        The determination of whether a converter can handle a document is primarily based on
+        NOTE: The method signature is designed to match that of the convert() method. This provides some
-        the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as
+        assurance that, if accepts() returns True, the convert() method will also be able to handle the document.
-        a secondary check if the MIME type is not sufficiently specific
+
-        (e.g., application/octet-stream). In the case of data retreived via HTTP, the
+        IMPORTANT: If this method advances the position in file_stream, it must also reset the position before
-        `steam_info.url` might also be referenced to guide conversion (e.g., special-handling
+        returning. This is because the convert() method may be called immediately after accepts().
-        for Wikipedia). Finally, the `stream_info.chatset` is used to determine the encoding
+
-        of the file content in cases of text/*
+        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.
        Returns:
        - bool: True if the converter can handle the document, False otherwise.
        """
        raise NotImplementedError(
            f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
        )
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a document to Markdown text.
        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
@ -105,68 +128,11 @@ class DocumentConverter:
        Returns:
        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
        or
        - None: If the converter cannot handle the document.
        Raises:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
        # Default implementation ensures backward compatibility with the legacy convert() method, and
        # should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
        # in the future.
        result = None
        used_legacy = False
        if stream_info.local_path is not None and os.path.exists(
            stream_info.local_path
        ):
            # If the stream is backed by a local file, pass it to the legacy convert() method
            try:
                result = self.convert(stream_info.local_path, **kwargs)
                used_legacy = True
            except (
                NotImplementedError
            ):  # If it wasn't implemented, rethrow the error, but with this as the stack trace
                raise NotImplementedError(
                    "Subclasses must implement the convert_stream method."
                )
        else:
            # Otherwise, we need to read the stream into a temporary file. There is potential for
            # thrashing here if there are many converters or conversion attempts
            cur_pos = file_stream.tell()
            temp_fd, temp_path = tempfile.mkstemp()
            try:
                with os.fdopen(temp_fd, "wb") as temp_file:
                    temp_file.write(file_stream.read())
                try:
                    result = self.convert(temp_path, **kwargs)
                    used_legacy = True
                except NotImplementedError:
                    raise NotImplementedError(
                        "Subclasses must implement the convert_stream method."
                    )
            finally:
                os.remove(temp_path)
                file_stream.seek(0)
        if used_legacy:
            message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
            if message not in _WARNED:
                warn(message, DeprecationWarning)
                _WARNED.append(message)
        return result
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Deprecated, path-based entry point for converting a document to Markdown text.

        Takes the location of a file on disk (`local_path`) rather than a file-like
        object; `convert_stream` is the stream-based replacement.
        This base implementation performs no conversion and always raises.

        Raises:
        - NotImplementedError: Always, signalling that a subclass has to provide the method.
        """
        not_implemented_message = "Subclasses must implement this method"
        raise NotImplementedError(not_implemented_message)
    @property
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -414,8 +414,16 @@ class MarkItDown:
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
        sorted_converters = sorted(self._converters, key=lambda x: x.priority)
        # Remember the initial stream position so that we can return to it
        cur_pos = file_stream.tell()
        for stream_info in stream_info_guesses + [StreamInfo()]:
            for converter in sorted_converters:
                # Sanity check -- make sure the cur_pos is still the same
                assert (
                    cur_pos == file_stream.tell()
                ), f"File stream position should NOT change between guess iterations"
                _kwargs = copy.deepcopy(kwargs)
                # Copy any additional global options
@ -442,17 +450,29 @@ class MarkItDown:
                    if stream_info.url is not None:
                        _kwargs["url"] = stream_info.url
-                # Attempt the conversion
+                # Check if the converter will accept the file, and if so, try to convert it
-                cur_pos = file_stream.tell()
+                _accepts = False
                try:
-                    res = converter.convert_stream(file_stream, stream_info, **_kwargs)
+                    _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
-                except Exception:
+                except NotImplementedError:
-                    failed_attempts.append(
+                    pass
-                        FailedConversionAttempt(
+
-                            converter=converter, exc_info=sys.exc_info()
+                # accept() should not have changed the file stream position
-                        )
+                assert (
-                    )
+                    cur_pos == file_stream.tell()
-                finally:
+                ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
                # Attempt the conversion
                if _accepts:
                    # try:
                    res = converter.convert(file_stream, stream_info, **_kwargs)
                    # except Exception:
                    #    failed_attempts.append(
                    #        FailedConversionAttempt(
                    #            converter=converter, exc_info=sys.exc_info()
                    #        )
                    #    )
                    # finally:
                    file_stream.seek(cur_pos)
                if res is not None:
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -1,14 +1,24 @@
-# type: ignore
+import io
 import base64
 import re
-
+import base64
 from typing import Union
 from urllib.parse import parse_qs, urlparse
 from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
 ]
 ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
 ]
 class BingSerpConverter(DocumentConverter):
    """
@ -21,23 +31,46 @@ class BingSerpConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def accepts(
-        # Bail if not a Bing SERP
+        self,
-        extension = kwargs.get("file_extension", "")
+        file_stream: BinaryIO,
-        if extension.lower() not in [".html", ".htm"]:
+        stream_info: StreamInfo,
-            return None
+        **kwargs: Any,  # Options to pass to the converter
-        url = kwargs.get("url", "")
+    ) -> bool:
-        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
+        """
-            return None
+        Make sure we're dealing with HTML content *from* Bing.
        """
        url = (stream_info.url or "").lower()
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
            # Not a Bing SERP URL
            return False
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True
        # Not HTML content
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the query parameters
-        parsed_params = parse_qs(urlparse(url).query)
+        parsed_params = parse_qs(urlparse(stream_info.url).query)
        query = parsed_params.get("q", [""])[0]
-        # Parse the file
+        # Parse the stream
-        soup = None
+        soup = BeautifulSoup(file_stream, "html.parser")
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")
        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -1,9 +1,10 @@
 import sys
-from typing import Union
+from typing import BinaryIO, Any
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
@ -16,6 +17,13 @@ except ImportError:
    _dependency_exc_info = sys.exc_info()
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 ]
 ACCEPTED_FILE_EXTENSIONS = [".docx"]
 class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
@ -25,13 +33,32 @@ class DocxConverter(HtmlConverter):
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
        self._html_converter = HtmlConverter()
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def accepts(
-        # Bail if not a DOCX
+        self,
-        extension = kwargs.get("file_extension", "")
+        file_stream: BinaryIO,
-        if extension.lower() != ".docx":
+        stream_info: StreamInfo,
-            return None
+        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@ -44,12 +71,7 @@ class DocxConverter(HtmlConverter):
                _dependency_exc_info[2]
            )  # Restore the original traceback
-        result = None
+        style_map = kwargs.get("style_map", None)
-        with open(local_path, "rb") as docx_file:
+        return self._html_converter.convert_string(
-            style_map = kwargs.get("style_map", None)
+            mammoth.convert_to_html(file_stream, style_map=style_map).value
-
+        )
            result = mammoth.convert_to_html(docx_file, style_map=style_map)
            html_content = result.value
            result = self._convert(html_content)
        return result
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -1,4 +1,5 @@
-from typing import Any, Union, BinaryIO
+import io
 from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 from .._base_converter import DocumentConverter, DocumentConverterResult
@ -24,39 +25,12 @@ class HtmlConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
-    def convert_stream(
+    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
-    ) -> Union[None, DocumentConverterResult]:
+    ) -> bool:
        # Bail if not html
        if not self._is_html(stream_info):
            return None
        # Read the stream into a string
        html_content = str(
            file_stream.read(),
            encoding=stream_info.charset if stream_info.charset else "utf-8",
        )
        return self._convert(html_content)
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Path-based HTML conversion.

        Parameters:
        - local_path: Path of the file to read; it is opened as UTF-8 text.
        - kwargs: Only `file_extension` is consulted here (defaults to "").

        Returns:
        - None: If the lower-cased `file_extension` is not in ACCEPTED_FILE_EXTENSIONS.
        - Otherwise, whatever `self._convert()` returns for the file's contents.
        """
        # Guard clause: anything without an accepted HTML extension is not ours
        suffix = kwargs.get("file_extension", "").lower()
        if suffix not in ACCEPTED_FILE_EXTENSIONS:
            return None

        # Read the whole document, then hand the text to the string-based helper
        with open(local_path, "rt", encoding="utf-8") as handle:
            html_text = handle.read()
        return self._convert(html_text)
    def _is_html(self, stream_info: StreamInfo) -> bool:
        """Helper function that checks if the stream is html."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
@ -69,11 +43,14 @@ class HtmlConverter(DocumentConverter):
        return False
-    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
+    def convert(
-        """Helper function that converts an HTML string."""
+        self,
-
+        file_stream: BinaryIO,
-        # Parse the string
+        stream_info: StreamInfo,
-        soup = BeautifulSoup(html_content, "html.parser")
+        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the stream
        soup = BeautifulSoup(file_stream, "html.parser")
        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
@ -96,3 +73,22 @@ class HtmlConverter(DocumentConverter):
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
        )
    def convert_string(
        self, html_content: str, *, url: Optional[str] = None, **kwargs
    ) -> DocumentConverterResult:
        """
        Convenience wrapper (outside the standard converter interface) that
        converts an HTML string to markdown.

        The string is encoded as UTF-8, wrapped in an in-memory byte stream, and
        handed to `convert()` together with a StreamInfo describing it as UTF-8
        "text/html" with a ".html" extension. This is handy for converters that
        produce HTML as an intermediate representation.

        Parameters:
        - html_content: The HTML markup to convert.
        - url: Optional URL placed in the StreamInfo passed to convert().
        - kwargs: Additional keyword arguments forwarded to convert().

        Returns:
        - DocumentConverterResult: The result of convert() for the in-memory stream.
        """
        html_stream = io.BytesIO(html_content.encode("utf-8"))
        html_info = StreamInfo(
            mimetype="text/html",
            extension=".html",
            charset="utf-8",
            url=url,
        )
        return self.convert(file_stream=html_stream, stream_info=html_info, **kwargs)
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -1,12 +1,13 @@
 import sys
 import base64
 import re
 import html
 import sys
-from typing import Union
+from typing import BinaryIO, Any
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
@ -19,7 +20,14 @@ except ImportError:
    _dependency_exc_info = sys.exc_info()
-class PptxConverter(HtmlConverter):
+ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.presentationml",
 ]
 ACCEPTED_FILE_EXTENSIONS = [".pptx"]
 class PptxConverter(DocumentConverter):
    """
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
    """
@ -28,6 +36,7 @@ class PptxConverter(HtmlConverter):
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
        self._html_converter = HtmlConverter()
    def _get_llm_description(
        self, llm_client, llm_model, image_blob, content_type, prompt=None
@ -58,12 +67,30 @@ class PptxConverter(HtmlConverter):
        )
        return response.choices[0].message.content
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def accepts(
-        # Bail if not a PPTX
+        self,
-        extension = kwargs.get("file_extension", "")
+        file_stream: BinaryIO,
-        if extension.lower() != ".pptx":
+        stream_info: StreamInfo,
-            return None
+        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@ -76,7 +103,8 @@ class PptxConverter(HtmlConverter):
                _dependency_exc_info[2]
            )  # Restore the original traceback
-        presentation = pptx.Presentation(local_path)
+        # Perform the conversion
        presentation = pptx.Presentation(file_stream)
        md_content = ""
        slide_num = 0
        for slide in presentation.slides:
@ -130,21 +158,7 @@ class PptxConverter(HtmlConverter):
                # Tables
                if self._is_table(shape):
-                    html_table = "<html><body><table>"
+                    md_content += self._convert_table_to_markdown(shape.table)
                    first_row = True
                    for row in shape.table.rows:
                        html_table += "<tr>"
                        for cell in row.cells:
                            if first_row:
                                html_table += "<th>" + html.escape(cell.text) + "</th>"
                            else:
                                html_table += "<td>" + html.escape(cell.text) + "</td>"
                        html_table += "</tr>"
                        first_row = False
                    html_table += "</table></body></html>"
                    md_content += (
                        "\n" + self._convert(html_table).text_content.strip() + "\n"
                    )
                # Charts
                if shape.has_chart:
@ -189,6 +203,23 @@ class PptxConverter(HtmlConverter):
            return True
        return False
    def _convert_table_to_markdown(self, table):
        # Write the table as HTML, then convert it to Markdown
        html_table = "<html><body><table>"
        first_row = True
        for row in table.rows:
            html_table += "<tr>"
            for cell in row.cells:
                if first_row:
                    html_table += "<th>" + html.escape(cell.text) + "</th>"
                else:
                    html_table += "<td>" + html.escape(cell.text) + "</td>"
            html_table += "</tr>"
            first_row = False
        html_table += "</table></body></html>"
        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
    def _convert_chart_to_markdown(self, chart):
        md = "\n\n### Chart"
        if chart.has_title:
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -1,11 +1,22 @@
 import io
 import re
-
+from typing import Any, BinaryIO, Optional
 from typing import Any, Union
 from bs4 import BeautifulSoup
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
 ]
 ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
 ]
 class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""
@ -15,21 +26,42 @@ class WikipediaConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
-    def convert(
+    def accepts(
-        self, local_path: str, **kwargs: Any
+        self,
-    ) -> Union[None, DocumentConverterResult]:
+        file_stream: BinaryIO,
-        # Bail if not Wikipedia
+        stream_info: StreamInfo,
-        extension = kwargs.get("file_extension", "")
+        **kwargs: Any,  # Options to pass to the converter
-        if extension.lower() not in [".html", ".htm"]:
+    ) -> bool:
-            return None
+        """
-        url = kwargs.get("url", "")
+        Make sure we're dealing with HTML content *from* Wikipedia.
-        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
+        """
            return None
-        # Parse the file
+        url = (stream_info.url or "").lower()
-        soup = None
+        mimetype = (stream_info.mimetype or "").lower()
-        with open(local_path, "rt", encoding="utf-8") as fh:
+        extension = (stream_info.extension or "").lower()
-            soup = BeautifulSoup(fh.read(), "html.parser")
+
        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
            # Not a Wikipedia URL
            return False
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True
        # Not HTML content
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the stream
        soup = BeautifulSoup(file_stream, "html.parser")
        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -1,10 +1,9 @@
 import sys
-
+from typing import BinaryIO, Any
 from typing import Union
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 from .._stream_info import StreamInfo
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
@ -22,8 +21,19 @@ try:
 except ImportError:
    _xls_dependency_exc_info = sys.exc_info()
 ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 ]
 ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
-class XlsxConverter(HtmlConverter):
+ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
    "application/vnd.ms-excel",
    "application/excel",
 ]
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 class XlsxConverter(DocumentConverter):
    """
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """
@ -32,13 +42,32 @@ class XlsxConverter(HtmlConverter):
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
        self._html_converter = HtmlConverter()
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def accepts(
-        # Bail if not a XLSX
+        self,
-        extension = kwargs.get("file_extension", "")
+        file_stream: BinaryIO,
-        if extension.lower() != ".xlsx":
+        stream_info: StreamInfo,
-            return None
+        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _xlsx_dependency_exc_info is not None:
            raise MissingDependencyException(
@ -51,27 +80,54 @@ class XlsxConverter(HtmlConverter):
                _xlsx_dependency_exc_info[2]
            )  # Restore the original traceback
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
-            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+            md_content += (
                self._html_converter.convert_string(html_content).markdown.strip()
                + "\n\n"
            )
        return DocumentConverterResult(markdown=md_content.strip())
-class XlsConverter(HtmlConverter):
+class XlsConverter(DocumentConverter):
    """
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def __init__(
-        # Bail if not a XLS
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-        extension = kwargs.get("file_extension", "")
+    ):
-        if extension.lower() != ".xls":
+        super().__init__(priority=priority)
-            return None
+        self._html_converter = HtmlConverter()
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Decide whether this converter should handle the stream, using only the
        metadata in `stream_info`; `file_stream` is not read here.

        Returns:
        - bool: True if the lower-cased extension is listed in ACCEPTED_XLS_FILE_EXTENSIONS,
          or the lower-cased mimetype starts with one of ACCEPTED_XLS_MIME_TYPE_PREFIXES;
          False otherwise.
        """
        # Missing metadata is treated as an empty string
        mime = (stream_info.mimetype or "").lower()
        file_suffix = (stream_info.extension or "").lower()

        has_known_extension = file_suffix in ACCEPTED_XLS_FILE_EXTENSIONS
        return has_known_extension or any(
            mime.startswith(known_prefix)
            for known_prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES
        )
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Load the dependencies
        if _xls_dependency_exc_info is not None:
            raise MissingDependencyException(
@ -84,11 +140,14 @@ class XlsConverter(HtmlConverter):
                _xls_dependency_exc_info[2]
            )  # Restore the original traceback
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
+        sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
-            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+            md_content += (
                self._html_converter.convert_string(html_content).markdown.strip()
                + "\n\n"
            )
        return DocumentConverterResult(markdown=md_content.strip())