Move priority outside of DocumentConverter, allowing converters to be reprioritized and keeping the DocumentConverter interface simple.

2025-03-05 20:09:18 -08:00 · 2025-03-05 20:09:18 -08:00 · a7ae7c53d8
commit a7ae7c53d8
parent 84f8198d8a
20 changed files with 82 additions and 129 deletions
--- a/packages/markitdown/src/markitdown/init.py
+++ b/packages/markitdown/src/markitdown/init.py
@ -3,7 +3,11 @@
 # SPDX-License-Identifier: MIT
 from .__about__ import __version__
-from ._markitdown import MarkItDown
+from ._markitdown import (
    MarkItDown,
    PRIORITY_SPECIFIC_FILE_FORMAT,
    PRIORITY_GENERIC_FILE_FORMAT,
 )
 from ._base_converter import DocumentConverterResult, DocumentConverter
 from ._stream_info import StreamInfo
 from ._exceptions import (
@ -25,4 +29,6 @@ __all__ = [
    "FileConversionException",
    "UnsupportedFormatException",
    "StreamInfo",
    "PRIORITY_SPECIFIC_FILE_FORMAT",
    "PRIORITY_GENERIC_FILE_FORMAT",
 ]
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -45,38 +45,6 @@ class DocumentConverterResult:
 class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""
    # Lower priority values are tried first.
    PRIORITY_SPECIFIC_FILE_FORMAT = (
        0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
    )
    PRIORITY_GENERIC_FILE_FORMAT = (
        10.0  # Near catch-all converters for mimetypes like text/*, etc.
    )
    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
        """
        Initialize the DocumentConverter with a given priority.
        Priorities work as follows: By default, most converters get priority
        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
        is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
        with lower values being tried first (i.e., higher priority).
        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.
        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. A converter's priority
        field reasserts some control over the order of converters.
        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the PlainTextConverter, but after the built-in converters.
        """
        self._priority = priority
    def accepts(
        self,
        file_stream: BinaryIO,
@ -138,12 +106,3 @@ class DocumentConverter:
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
        raise NotImplementedError("Subclasses must implement this method")
    @property
    def priority(self) -> float:
        """Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
        return self._priority
    @priority.setter
    def priority(self, value: float):
        self._priority = value
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -7,6 +7,7 @@ import tempfile
 import warnings
 import traceback
 import io
 from dataclasses import dataclass
 from importlib.metadata import entry_points
 from typing import Any, List, Optional, Union, BinaryIO
 from pathlib import Path
@ -47,8 +48,15 @@ from ._exceptions import (
    FailedConversionAttempt,
 )
-# Override mimetype for csv to fix issue on windows
+
-mimetypes.add_type("text/csv", ".csv")
+# Lower priority values are tried first.
 PRIORITY_SPECIFIC_FILE_FORMAT = (
    0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
 )
 PRIORITY_GENERIC_FILE_FORMAT = (
    10.0  # Near catch-all converters for mimetypes like text/*, etc.
 )
 _plugins: List[Any] = []
@ -73,6 +81,14 @@ def _load_plugins() -> List[Any]:
    return _plugins
@dataclass(kw_only=True, frozen=True)
 class ConverterRegistration:
    """A registration of a converter with its priority and other metadata."""
    converter: DocumentConverter
    priority: float
 class MarkItDown:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""
@ -100,7 +116,7 @@ class MarkItDown:
        self._style_map: Union[str | None] = None
        # Register the converters
-        self._converters: List[DocumentConverter] = []
+        self._converters: List[ConverterRegistration] = []
        if (
            enable_builtins is None or enable_builtins
@ -128,9 +144,15 @@ class MarkItDown:
            # Register converters for successful browsing operations
            # Later registrations are tried first / take higher priority than earlier registrations
            # To this end, the most specific converters should appear below the most generic converters
-            self.register_converter(PlainTextConverter())
+            self.register_converter(
-            self.register_converter(ZipConverter(markitdown=self))
+                PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
-            self.register_converter(HtmlConverter())
+            )
            self.register_converter(
                ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
            )
            self.register_converter(
                HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
            )
            self.register_converter(RssConverter())
            self.register_converter(WikipediaConverter())
            self.register_converter(YouTubeConverter())
@ -418,13 +440,14 @@ class MarkItDown:
        # Create a copy of the registered converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
-        sorted_converters = sorted(self._converters, key=lambda x: x.priority)
+        sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
        # Remember the initial stream position so that we can return to it
        cur_pos = file_stream.tell()
        for stream_info in stream_info_guesses + [StreamInfo()]:
-            for converter in sorted_converters:
+            for converter_registration in sorted_registrations:
                converter = converter_registration.converter
                # Sanity check -- make sure the cur_pos is still the same
                assert (
                    cur_pos == file_stream.tell()
@ -506,6 +529,34 @@ class MarkItDown:
        )
        self.register_converter(converter)
-    def register_converter(self, converter: DocumentConverter) -> None:
+    def register_converter(
-        """Register a page text converter."""
+        self,
-        self._converters.insert(0, converter)
+        converter: DocumentConverter,
        *,
        priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
    ) -> None:
        """
        Register a DocumentConverter with a given priority.
        Priorities work as follows: By default, most converters get priority
        PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions are the
        PlainTextConverter, HtmlConverter, and ZipConverter, which get
        priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values
        being tried first (i.e., higher priority).
        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.
        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. The registration's priority
        field reasserts some control over the order of converters.
        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the generic built-ins (e.g., the PlainTextConverter), but after
        the format-specific built-in converters.
        """
        self._converters.insert(
            0, ConverterRegistration(converter=converter, priority=priority)
        )
--- a/packages/markitdown/src/markitdown/converters/_audio_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_audio_converter.py
@ -26,11 +26,6 @@ class AudioConverter(DocumentConverter):
    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -26,11 +26,6 @@ class BingSerpConverter(DocumentConverter):
    NOTE: It is better to use the Bing API
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@ -69,11 +69,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
    def __init__(
        self,
        *,
        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
-        super().__init__(priority=priority)
+        super().__init__()
        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -29,10 +29,8 @@ class DocxConverter(HtmlConverter):
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """
-    def __init__(
+    def __init__(self):
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+        super().__init__()
    ):
        super().__init__(priority=priority)
        self._html_converter = HtmlConverter()
    def accepts(
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -20,11 +20,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@ -18,11 +18,6 @@ class ImageConverter(DocumentConverter):
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@ -15,11 +15,6 @@ ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
 class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@ -28,11 +28,6 @@ class OutlookMsgConverter(DocumentConverter):
    - Email body content
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@ -34,11 +34,6 @@ class PdfConverter(DocumentConverter):
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -29,11 +29,6 @@ IGNORE_MIME_TYPE_PREFIXES = [
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -35,10 +35,8 @@ class PptxConverter(DocumentConverter):
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
    """
-    def __init__(
+    def __init__(self):
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+        super().__init__()
    ):
        super().__init__(priority=priority)
        self._html_converter = HtmlConverter()
    def accepts(
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@ -26,11 +26,6 @@ CANDIDATE_FILE_EXTENSIONS = [
 class RssConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -21,11 +21,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -38,10 +38,8 @@ class XlsxConverter(DocumentConverter):
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """
-    def __init__(
+    def __init__(self):
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+        super().__init__()
    ):
        super().__init__(priority=priority)
        self._html_converter = HtmlConverter()
    def accepts(
@ -100,10 +98,8 @@ class XlsConverter(DocumentConverter):
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """
-    def __init__(
+    def __init__(self):
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+        super().__init__()
    ):
        super().__init__(priority=priority)
        self._html_converter = HtmlConverter()
    def accepts(
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -34,11 +34,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@ -61,11 +61,10 @@ class ZipConverter(DocumentConverter):
    def __init__(
        self,
        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
        *,
        markitdown: "MarkItDown",
    ):
-        super().__init__(priority=priority)
+        super().__init__()
        self._markitdown = markitdown
    def accepts(
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@ -530,8 +530,10 @@ def test_markitdown_exiftool() -> None:
    finally:
        warnings.resetwarnings()
    # Test explicitly setting the location of exiftool
    which_exiftool = shutil.which("exiftool")
    assert which_exiftool is not None
    # Test explicitly setting the location of exiftool
    markitdown = MarkItDown(exiftool_path=which_exiftool)
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL: