Added priority argument to all converter constructors.

2025-02-11 10:13:36 -08:00 · 2025-02-11 10:13:36 -08:00 · 4298cfad8d
commit 4298cfad8d
parent 4b62506451
21 changed files with 128 additions and 15 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -47,10 +47,6 @@ from ._exceptions import (
 # Override mimetype for csv to fix issue on windows
 mimetypes.add_type("text/csv", ".csv")

-PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
-PRIORITY_GENERIC_FILE_FORMAT = 10.0
-
-
 _plugins: Union[None | List[Any]] = None


@ -103,6 +99,23 @@ class MarkItDown:
        # Register the converters
        self._page_converters: List[DocumentConverter] = []

+        # Note: We have tight control over the order of built-in converters, but
+        # plugins can register converters in any order. A converter's .priority
+        # reasserts some control over the order of converters.
+        #
+        # Priorities work as follows. By default, most converters get priority
+        # DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
+        # is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
+        # with lower values being tried first (i.e., higher priority).
+        #
+        # Just prior to conversion, the converters are sorted by priority, using
+        # a stable sort. This means that converters with the same priority will
+        # remain in the same order, with the most recently registered converters
+        # appearing first.
+        #
+        # Plugins can register converters with any priority, to appear before or
+        # after the built-ins. For example, a plugin with priority 9 will run
+        # before the PlainTextConverter, but after the built-in converters.
        if (
            enable_builtins is None or enable_builtins
        ):  # Default to True when not specified
@ -123,6 +136,8 @@ class MarkItDown:
            self._llm_model = kwargs.get("llm_model")
            self._exiftool_path = kwargs.get("exiftool_path")
            self._style_map = kwargs.get("style_map")
+            if self._exiftool_path is None:
+                self._exiftool_path = os.getenv("EXIFTOOL_PATH")

            # Register converters for successful browsing operations
            # Later registrations are tried first / take higher priority than earlier registrations
--- a/packages/markitdown/src/markitdown/converters/_base.py
+++ b/packages/markitdown/src/markitdown/converters/_base.py
@ -12,7 +12,15 @@ class DocumentConverterResult:
 class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

-    def __init__(self, priority: float = 0.0):
+    # Lower priority values are tried first.
+    PRIORITY_SPECIFIC_FILE_FORMAT = (
+        0.0  # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., wikipedia
+    )
+    PRIORITY_GENERIC_FILE_FORMAT = (
+        10.0  # Near catch-all converters for mimetypes like text/*, etc.
+    )
+
+    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
        self._priority = priority

    def convert(
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
    NOTE: It is better to use the Bing API
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a Bing SERP
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):

    def __init__(
        self,
+        *,
+        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
+        super().__init__(priority=priority)
+
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -6,6 +6,7 @@ from ._base import (
    DocumentConverterResult,
 )

+from ._base import DocumentConverter
 from ._html_converter import HtmlConverter


@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a DOCX
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -8,6 +8,11 @@ from ._markdownify import _CustomMarkdownify
 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@ -1,5 +1,5 @@
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter


@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an image
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@ -12,6 +12,11 @@ from .._exceptions import FileConversionException
 class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_media_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_media_converter.py
@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
    Abstract class for multi-modal media (e.g., images and audio)
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def _get_metadata(self, local_path, exiftool_path=None):
        if not exiftool_path:
            which_exiftool = shutil.which("exiftool")
@ -27,10 +32,10 @@ This warning will be removed in future releases.

            return None
        else:
-            try:
+            if True:
                result = subprocess.run(
                    [exiftool_path, "-json", local_path], capture_output=True, text=True
                ).stdout
                return json.loads(result)[0]
-            except Exception:
-                return None
+            # except Exception:
+            #    return None
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@ -1,6 +1,6 @@
 import tempfile
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._wav_converter import WavConverter
 from warnings import resetwarnings, catch_warnings

@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a MP3
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
    - Email body content
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a PDF
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def _get_llm_description(
        self, llm_client, llm_model, image_blob, content_type, prompt=None
    ):
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
 class RssConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(
        self, local_path: str, **kwargs
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_wav_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@ -1,5 +1,5 @@
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter

 # Optional Transcription support
@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a WAV
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -10,6 +10,11 @@ from ._markdownify import _CustomMarkdownify
 class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -2,7 +2,7 @@ from typing import Union

 import pandas as pd

-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter


@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a XLSX
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -19,6 +19,11 @@ except ModuleNotFoundError:
 class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
    - Cleans up temporary files after processing
    """

+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:

 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
-    # test_markitdown_remote()
-    # test_markitdown_local()
+    test_markitdown_remote()
+    test_markitdown_local()
    test_markitdown_exiftool()
-    # test_markitdown_deprecation()
    # test_markitdown_llm()
+    print("All tests passed!")