From a7ae7c53d890c4ea2fc17192bfd6af562ff38b54 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Wed, 5 Mar 2025 20:09:18 -0800
Subject: [PATCH] Move priority outside DocumentConverter, allowing converters
 to be reprioritized and keeping the DocumentConverter interface simple.

---
 .../markitdown/src/markitdown/__init__.py     |  8 +-
 .../src/markitdown/_base_converter.py         | 41 -----------
 .../markitdown/src/markitdown/_markitdown.py  | 73 ++++++++++++++++---
 .../markitdown/converters/_audio_converter.py |  5 --
 .../converters/_bing_serp_converter.py        |  5 --
 .../converters/_doc_intel_converter.py        |  3 +-
 .../markitdown/converters/_docx_converter.py  |  6 +-
 .../markitdown/converters/_html_converter.py  |  5 --
 .../markitdown/converters/_image_converter.py |  5 --
 .../markitdown/converters/_ipynb_converter.py |  5 --
 .../converters/_outlook_msg_converter.py      |  5 --
 .../markitdown/converters/_pdf_converter.py   |  5 --
 .../converters/_plain_text_converter.py       |  5 --
 .../markitdown/converters/_pptx_converter.py  |  6 +-
 .../markitdown/converters/_rss_converter.py   |  5 --
 .../converters/_wikipedia_converter.py        |  5 --
 .../markitdown/converters/_xlsx_converter.py  | 12 +--
 .../converters/_youtube_converter.py          |  5 --
 .../markitdown/converters/_zip_converter.py   |  3 +-
 packages/markitdown/tests/test_markitdown.py  |  4 +-
 20 files changed, 82 insertions(+), 129 deletions(-)

diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py
index bb6fcdb..af356dd 100644
--- a/packages/markitdown/src/markitdown/__init__.py
+++ b/packages/markitdown/src/markitdown/__init__.py
@@ -3,7 +3,11 @@
 # SPDX-License-Identifier: MIT
 
 from .__about__ import __version__
-from ._markitdown import MarkItDown
+from ._markitdown import (
+    MarkItDown,
+    PRIORITY_SPECIFIC_FILE_FORMAT,
+    PRIORITY_GENERIC_FILE_FORMAT,
+)
 from ._base_converter import DocumentConverterResult, DocumentConverter
 from ._stream_info import StreamInfo
 from ._exceptions import (
@@ -25,4 +29,6 @@ __all__ = [
     "FileConversionException",
     "UnsupportedFormatException",
     "StreamInfo",
+    "PRIORITY_SPECIFIC_FILE_FORMAT",
+    "PRIORITY_GENERIC_FILE_FORMAT",
 ]
diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py
index f4fb3a1..2f0ca9d 100644
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@@ -45,38 +45,6 @@ class DocumentConverterResult:
 class DocumentConverter:
     """Abstract superclass of all DocumentConverters."""
 
-    # Lower priority values are tried first.
-    PRIORITY_SPECIFIC_FILE_FORMAT = (
-        0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
-    )
-    PRIORITY_GENERIC_FILE_FORMAT = (
-        10.0  # Near catch-all converters for mimetypes like text/*, etc.
-    )
-
-    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
-        """
-        Initialize the DocumentConverter with a given priority.
-
-        Priorities work as follows: By default, most converters get priority
-        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
-        is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10),
-        with lower values being tried first (i.e., higher priority).
-
-        Just prior to conversion, the converters are sorted by priority, using
-        a stable sort. This means that converters with the same priority will
-        remain in the same order, with the most recently registered converters
-        appearing first.
-
-        We have tight control over the order of built-in converters, but
-        plugins can register converters in any order. A converter's priority
-        field reasserts some control over the order of converters.
-
-        Plugins can register converters with any priority, to appear before or
-        after the built-ins. For example, a plugin with priority 9 will run
-        before the PlainTextConverter, but after the built-in converters.
-        """
-        self._priority = priority
-
     def accepts(
         self,
         file_stream: BinaryIO,
@@ -138,12 +106,3 @@ class DocumentConverter:
         - MissingDependencyException: If the converter requires a dependency that is not installed.
         """
         raise NotImplementedError("Subclasses must implement this method")
-
-    @property
-    def priority(self) -> float:
-        """Priority of the converter in markitdown's converter list. Higher priority values are tried first."""
-        return self._priority
-
-    @priority.setter
-    def priority(self, value: float):
-        self._priority = value
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index a51f227..6086eb9 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -7,6 +7,7 @@ import tempfile
 import warnings
 import traceback
 import io
+from dataclasses import dataclass
 from importlib.metadata import entry_points
 from typing import Any, List, Optional, Union, BinaryIO
 from pathlib import Path
@@ -47,8 +48,15 @@ from ._exceptions import (
     FailedConversionAttempt,
 )
 
-# Override mimetype for csv to fix issue on windows
-mimetypes.add_type("text/csv", ".csv")
+
+# Lower priority values are tried first.
+PRIORITY_SPECIFIC_FILE_FORMAT = (
+    0.0  # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., Wikipedia
+)
+PRIORITY_GENERIC_FILE_FORMAT = (
+    10.0  # Near catch-all converters for mimetypes like text/*, etc.
+)
+
 
 _plugins: List[Any] = []
 
@@ -73,6 +81,14 @@ def _load_plugins() -> List[Any]:
     return _plugins
 
 
+@dataclass(kw_only=True, frozen=True)
+class ConverterRegistration:
+    """A registration of a converter with its priority and other metadata."""
+
+    converter: DocumentConverter
+    priority: float
+
+
 class MarkItDown:
     """(In preview) An extremely simple text-based document reader, suitable for LLM use.
     This reader will convert common file-types or webpages to Markdown."""
@@ -100,7 +116,7 @@ class MarkItDown:
         self._style_map: Union[str | None] = None
 
         # Register the converters
-        self._converters: List[DocumentConverter] = []
+        self._converters: List[ConverterRegistration] = []
 
         if (
             enable_builtins is None or enable_builtins
@@ -128,9 +144,15 @@ class MarkItDown:
             # Register converters for successful browsing operations
             # Later registrations are tried first / take higher priority than earlier registrations
             # To this end, the most specific converters should appear below the most generic converters
-            self.register_converter(PlainTextConverter())
-            self.register_converter(ZipConverter(markitdown=self))
-            self.register_converter(HtmlConverter())
+            self.register_converter(
+                PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
             self.register_converter(RssConverter())
             self.register_converter(WikipediaConverter())
             self.register_converter(YouTubeConverter())
@@ -418,13 +440,14 @@ class MarkItDown:
         # Create a copy of the page_converters list, sorted by priority.
         # We do this with each call to _convert because the priority of converters may change between calls.
         # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
-        sorted_converters = sorted(self._converters, key=lambda x: x.priority)
+        sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
 
         # Remember the initial stream position so that we can return to it
         cur_pos = file_stream.tell()
 
         for stream_info in stream_info_guesses + [StreamInfo()]:
-            for converter in sorted_converters:
+            for converter_registration in sorted_registrations:
+                converter = converter_registration.converter
                 # Sanity check -- make sure the cur_pos is still the same
                 assert (
                     cur_pos == file_stream.tell()
@@ -506,6 +529,34 @@ class MarkItDown:
         )
         self.register_converter(converter)
 
-    def register_converter(self, converter: DocumentConverter) -> None:
-        """Register a page text converter."""
-        self._converters.insert(0, converter)
+    def register_converter(
+        self,
+        converter: DocumentConverter,
+        *,
+        priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
+    ) -> None:
+        """
+        Register a DocumentConverter with a given priority.
+
+        Priorities work as follows: By default, most converters get priority
+        PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions are the
+        PlainTextConverter, HtmlConverter, and ZipConverter, which get
+        priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values
+        being tried first (i.e., higher priority).
+
+        Just prior to conversion, the converters are sorted by priority, using
+        a stable sort. This means that converters with the same priority will
+        remain in the same order, with the most recently registered converters
+        appearing first.
+
+        We have tight control over the order of built-in converters, but
+        plugins can register converters in any order. The registration's priority
+        field reasserts some control over the order of converters.
+
+        Plugins can register converters with any priority, to appear before or
+        after the built-ins. For example, a converter registered with priority 9
+        runs after the format-specific built-ins but before the generic ones.
+        """
+        self._converters.insert(
+            0, ConverterRegistration(converter=converter, priority=priority)
+        )
diff --git a/packages/markitdown/src/markitdown/converters/_audio_converter.py b/packages/markitdown/src/markitdown/converters/_audio_converter.py
index d502deb..845ad5d 100644
--- a/packages/markitdown/src/markitdown/converters/_audio_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_audio_converter.py
@@ -26,11 +26,6 @@ class AudioConverter(DocumentConverter):
     Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
index 2e9913c..7dd9e24 100644
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -26,11 +26,6 @@ class BingSerpConverter(DocumentConverter):
     NOTE: It is better to use the Bing API
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
index 00ab0fc..2f116d0 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -69,11 +69,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
     def __init__(
         self,
         *,
-        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
         endpoint: str,
         api_version: str = "2024-07-31-preview",
     ):
-        super().__init__(priority=priority)
+        super().__init__()
 
         # Raise an error if the dependencies are not available.
         # This is different than other converters since this one isn't even instantiated
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index a5090ac..c568acb 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -29,10 +29,8 @@ class DocxConverter(HtmlConverter):
     Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
         self._html_converter = HtmlConverter()
 
     def accepts(
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
index 7d0c916..8a8203d 100644
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -20,11 +20,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class HtmlConverter(DocumentConverter):
     """Anything with content type text/html"""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py
index e03dfe8..dd8fbac 100644
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -18,11 +18,6 @@ class ImageConverter(DocumentConverter):
     Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
index 490e4e1..f8ba193 100644
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -15,11 +15,6 @@ ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
 class IpynbConverter(DocumentConverter):
     """Converts Jupyter Notebook (.ipynb) files to Markdown."""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
index cef3dc7..8a61b0c 100644
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@@ -28,11 +28,6 @@ class OutlookMsgConverter(DocumentConverter):
     - Email body content
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index 445dba3..4586ef1 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -34,11 +34,6 @@ class PdfConverter(DocumentConverter):
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
index 92da511..4a21d3a 100644
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -29,11 +29,6 @@ IGNORE_MIME_TYPE_PREFIXES = [
 class PlainTextConverter(DocumentConverter):
     """Anything with content type text/plain"""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index e51739e..bea1226 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -35,10 +35,8 @@ class PptxConverter(DocumentConverter):
     Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
         self._html_converter = HtmlConverter()
 
     def accepts(
diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py
index 3074c6c..dbafc1b 100644
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@@ -26,11 +26,6 @@ CANDIDATE_FILE_EXTENSIONS = [
 class RssConverter(DocumentConverter):
     """Convert RSS / Atom type to markdown"""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
index c9176f6..5b054af 100644
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -21,11 +21,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class WikipediaConverter(DocumentConverter):
     """Handle Wikipedia pages separately, focusing only on the main document content."""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index f11af31..3d0e1ab 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -38,10 +38,8 @@ class XlsxConverter(DocumentConverter):
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
         self._html_converter = HtmlConverter()
 
     def accepts(
@@ -100,10 +98,8 @@ class XlsConverter(DocumentConverter):
     Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
     """
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
+    def __init__(self):
+        super().__init__()
         self._html_converter = HtmlConverter()
 
     def accepts(
diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
index 2efc6ea..5a158d5 100644
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -34,11 +34,6 @@ ACCEPTED_FILE_EXTENSIONS = [
 class YouTubeConverter(DocumentConverter):
     """Handle YouTube specially, focusing on the video title, description, and transcript."""
 
-    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
-    ):
-        super().__init__(priority=priority)
-
     def accepts(
         self,
         file_stream: BinaryIO,
diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py
index c60d94a..cb1a7e6 100644
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -61,11 +61,10 @@ class ZipConverter(DocumentConverter):
 
     def __init__(
         self,
-        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
         *,
         markitdown: "MarkItDown",
     ):
-        super().__init__(priority=priority)
+        super().__init__()
         self._markitdown = markitdown
 
     def accepts(
diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py
index 8905253..8c34da0 100644
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@@ -530,8 +530,10 @@ def test_markitdown_exiftool() -> None:
     finally:
         warnings.resetwarnings()
 
-    # Test explicitly setting the location of exiftool
     which_exiftool = shutil.which("exiftool")
+    assert which_exiftool is not None
+
+    # Test explicitly setting the location of exiftool
     markitdown = MarkItDown(exiftool_path=which_exiftool)
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
     for key in JPG_TEST_EXIFTOOL: