Move priority outside of DocumentConverter, allowing converters to be reprioritized and keeping the DocumentConverter interface simple.

2025-03-05 20:09:18 -08:00 · 2025-03-05 20:09:18 -08:00 · a7ae7c53d8
commit a7ae7c53d8
parent 84f8198d8a
20 changed files with 82 additions and 129 deletions
--- a/packages/markitdown/src/markitdown/__init__.py
+++ b/packages/markitdown/src/markitdown/__init__.py
@ -3,7 +3,11 @@
 # SPDX-License-Identifier: MIT

 from .__about__ import __version__
-from ._markitdown import MarkItDown
+from ._markitdown import (
+    MarkItDown,
+    PRIORITY_SPECIFIC_FILE_FORMAT,
+    PRIORITY_GENERIC_FILE_FORMAT,
+)
 from ._base_converter import DocumentConverterResult, DocumentConverter
 from ._stream_info import StreamInfo
 from ._exceptions import (
@ -25,4 +29,6 @@ __all__ = [
    "FileConversionException",
    "UnsupportedFormatException",
    "StreamInfo",
+    "PRIORITY_SPECIFIC_FILE_FORMAT",
+    "PRIORITY_GENERIC_FILE_FORMAT",
 ]
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -45,38 +45,6 @@ class DocumentConverterResult:
 class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

-    # Lower priority values are tried first.
-    PRIORITY_SPECIFIC_FILE_FORMAT = (
-        0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
-    )
-    PRIORITY_GENERIC_FILE_FORMAT = (
-        10.0  # Near catch-all converters for mimetypes like text/*, etc.
-    )
-
-    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
-        """
-        Initialize the DocumentConverter with a given priority.
-
-        Priorities work as follows: By default, most converters get priority
-        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
-        is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10),
-        with lower values being tried first (i.e., higher priority).
-
-        Just prior to conversion, the converters are sorted by priority, using
-        a stable sort. This means that converters with the same priority will
-        remain in the same order, with the most recently registered converters
-        appearing first.
-
-        We have tight control over the order of built-in converters, but
-        plugins can register converters in any order. A converter's priority
-        field reasserts some control over the order of converters.
-
-        Plugins can register converters with any priority, to appear before or
-        after the built-ins. For example, a plugin with priority 9 will run
-        before the PlainTextConverter, but after the built-in converters.
-        """
-        self._priority = priority
-
    def accepts(
        self,
        file_stream: BinaryIO,
@ -138,12 +106,3 @@ class DocumentConverter:
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
        raise NotImplementedError("Subclasses must implement this method")
-
-    @property
-    def priority(self) -> float:
-        """Priority of the converter in markitdown's converter list. Higher priority values are tried first."""
-        return self._priority
-
-    @priority.setter
-    def priority(self, value: float):
-        self._priority = value
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -7,6 +7,7 @@ import tempfile
 import warnings
 import traceback
 import io
+from dataclasses import dataclass
 from importlib.metadata import entry_points
 from typing import Any, List, Optional, Union, BinaryIO
 from pathlib import Path
@ -47,8 +48,15 @@ from ._exceptions import (
    FailedConversionAttempt,
 )

-# Override mimetype for csv to fix issue on windows
-mimetypes.add_type("text/csv", ".csv")
+
+# Lower priority values are tried first.
+PRIORITY_SPECIFIC_FILE_FORMAT = (
+    0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
+)
+PRIORITY_GENERIC_FILE_FORMAT = (
+    10.0  # Near catch-all converters for mimetypes like text/*, etc.
+)
+

 _plugins: List[Any] = []

@ -73,6 +81,14 @@ def _load_plugins() -> List[Any]:
    return _plugins


+@dataclass(kw_only=True, frozen=True)
+class ConverterRegistration:
+    """A registration of a converter with its priority and other metadata."""
+
+    converter: DocumentConverter
+    priority: float
+
+
 class MarkItDown:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""
@ -100,7 +116,7 @@ class MarkItDown:
        self._style_map: Union[str | None] = None

        # Register the converters
-        self._converters: List[DocumentConverter] = []
+        self._converters: List[ConverterRegistration] = []

        if (
            enable_builtins is None or enable_builtins
@ -128,9 +144,15 @@ class MarkItDown:
            # Register converters for successful browsing operations
            # Later registrations are tried first / take higher priority than earlier registrations
            # To this end, the most specific converters should appear below the most generic converters
-            self.register_converter(PlainTextConverter())
-            self.register_converter(ZipConverter(markitdown=self))
-            self.register_converter(HtmlConverter())
+            self.register_converter(
+                PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
            self.register_converter(RssConverter())
            self.register_converter(WikipediaConverter())
            self.register_converter(YouTubeConverter())
@ -418,13 +440,14 @@ class MarkItDown:
        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
-        sorted_converters = sorted(self._converters, key=lambda x: x.priority)
+        sorted_registrations = sorted(self._converters, key=lambda x: x.priority)

        # Remember the initial stream position so that we can return to it
        cur_pos = file_stream.tell()

        for stream_info in stream_info_guesses + [StreamInfo()]:
-            for converter in sorted_converters:
+            for converter_registration in sorted_registrations:
+                converter = converter_registration.converter
                # Sanity check -- make sure the cur_pos is still the same
                assert (
                    cur_pos == file_stream.tell()
@ -506,6 +529,34 @@ class MarkItDown:
        )
        self.register_converter(converter)

-    def register_converter(self, converter: DocumentConverter) -> None:
-        """Register a page text converter."""
-        self._converters.insert(0, converter)
+    def register_converter(
+        self,
+        converter: DocumentConverter,
+        *,
+        priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
+    ) -> None:
+        """
+        Register a DocumentConverter with a given priority.
+
+        Priorities work as follows: By default, most converters get priority
+        PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions are the
+        PlainTextConverter, HtmlConverter, and ZipConverter, which get
+        priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values
+        being tried first (i.e., higher priority).
+
+        Just prior to conversion, the converters are sorted by priority, using
+        a stable sort. This means that converters with the same priority will
+        remain in the same order, with the most recently registered converters
+        appearing first.
+
+        We have tight control over the order of built-in converters, but
+        plugins can register converters in any order. The registration's priority
+        field reasserts some control over the order of converters.
+
+        Plugins can register converters with any priority, to appear before or
+        after the built-ins. For example, a plugin with priority 9 will run
+        before PlainTextConverter, but after the format-specific built-ins.
+        """
+        self._converters.insert(
+            0, ConverterRegistration(converter=converter, priority=priority)
+        )
--- a/packages/markitdown/src/markitdown/converters/_audio_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_audio_converter.py
@ -26,11 +26,6 @@ class AudioConverter(DocumentConverter):
    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -26,11 +26,6 @@ class BingSerpConverter(DocumentConverter):
    NOTE: It is better to use the Bing API
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@ -69,11 +69,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
    def __init__(
        self,
        *,
-        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
-        super().__init__(priority=priority)
+        super().__init__()

        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -29,10 +29,8 @@ class DocxConverter(HtmlConverter):
    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -20,11 +20,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@ -18,11 +18,6 @@ class ImageConverter(DocumentConverter):
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@ -15,11 +15,6 @@ ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
 class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@ -28,11 +28,6 @@ class OutlookMsgConverter(DocumentConverter):
    - Email body content
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@ -34,11 +34,6 @@ class PdfConverter(DocumentConverter):
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -29,11 +29,6 @@ IGNORE_MIME_TYPE_PREFIXES = [
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -35,10 +35,8 @@ class PptxConverter(DocumentConverter):
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@ -26,11 +26,6 @@ CANDIDATE_FILE_EXTENSIONS = [
 class RssConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -21,11 +21,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -38,10 +38,8 @@ class XlsxConverter(DocumentConverter):
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
@ -100,10 +98,8 @@ class XlsConverter(DocumentConverter):
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -34,11 +34,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""

-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@ -61,11 +61,10 @@ class ZipConverter(DocumentConverter):

    def __init__(
        self,
-        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
        *,
        markitdown: "MarkItDown",
    ):
-        super().__init__(priority=priority)
+        super().__init__()
        self._markitdown = markitdown

    def accepts(
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@ -530,8 +530,10 @@ def test_markitdown_exiftool() -> None:
    finally:
        warnings.resetwarnings()

-    # Test explicitly setting the location of exiftool
    which_exiftool = shutil.which("exiftool")
+    assert which_exiftool is not None
+
+    # Test explicitly setting the location of exiftool
    markitdown = MarkItDown(exiftool_path=which_exiftool)
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL: