From a7ae7c53d890c4ea2fc17192bfd6af562ff38b54 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Wed, 5 Mar 2025 20:09:18 -0800 Subject: [PATCH] Move priority to outside DocumentConverter, allowing them to be reprioritized, and keeping the DocumentConverter interface simple. --- .../markitdown/src/markitdown/__init__.py | 8 +- .../src/markitdown/_base_converter.py | 41 ----------- .../markitdown/src/markitdown/_markitdown.py | 73 ++++++++++++++++--- .../markitdown/converters/_audio_converter.py | 5 -- .../converters/_bing_serp_converter.py | 5 -- .../converters/_doc_intel_converter.py | 3 +- .../markitdown/converters/_docx_converter.py | 6 +- .../markitdown/converters/_html_converter.py | 5 -- .../markitdown/converters/_image_converter.py | 5 -- .../markitdown/converters/_ipynb_converter.py | 5 -- .../converters/_outlook_msg_converter.py | 5 -- .../markitdown/converters/_pdf_converter.py | 5 -- .../converters/_plain_text_converter.py | 5 -- .../markitdown/converters/_pptx_converter.py | 6 +- .../markitdown/converters/_rss_converter.py | 5 -- .../converters/_wikipedia_converter.py | 5 -- .../markitdown/converters/_xlsx_converter.py | 12 +-- .../converters/_youtube_converter.py | 5 -- .../markitdown/converters/_zip_converter.py | 3 +- packages/markitdown/tests/test_markitdown.py | 4 +- 20 files changed, 82 insertions(+), 129 deletions(-) diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py index bb6fcdb..af356dd 100644 --- a/packages/markitdown/src/markitdown/__init__.py +++ b/packages/markitdown/src/markitdown/__init__.py @@ -3,7 +3,11 @@ # SPDX-License-Identifier: MIT from .__about__ import __version__ -from ._markitdown import MarkItDown +from ._markitdown import ( + MarkItDown, + PRIORITY_SPECIFIC_FILE_FORMAT, + PRIORITY_GENERIC_FILE_FORMAT, +) from ._base_converter import DocumentConverterResult, DocumentConverter from ._stream_info import StreamInfo from ._exceptions import ( @@ -25,4 +29,6 @@ __all__ = [ 
"FileConversionException", "UnsupportedFormatException", "StreamInfo", + "PRIORITY_SPECIFIC_FILE_FORMAT", + "PRIORITY_GENERIC_FILE_FORMAT", ] diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py index f4fb3a1..2f0ca9d 100644 --- a/packages/markitdown/src/markitdown/_base_converter.py +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -45,38 +45,6 @@ class DocumentConverterResult: class DocumentConverter: """Abstract superclass of all DocumentConverters.""" - # Lower priority values are tried first. - PRIORITY_SPECIFIC_FILE_FORMAT = ( - 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia - ) - PRIORITY_GENERIC_FILE_FORMAT = ( - 10.0 # Near catch-all converters for mimetypes like text/*, etc. - ) - - def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): - """ - Initialize the DocumentConverter with a given priority. - - Priorities work as follows: By default, most converters get priority - DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception - is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), - with lower values being tried first (i.e., higher priority). - - Just prior to conversion, the converters are sorted by priority, using - a stable sort. This means that converters with the same priority will - remain in the same order, with the most recently registered converters - appearing first. - - We have tight control over the order of built-in converters, but - plugins can register converters in any order. A converter's priority - field reasserts some control over the order of converters. - - Plugins can register converters with any priority, to appear before or - after the built-ins. For example, a plugin with priority 9 will run - before the PlainTextConverter, but after the built-in converters. 
- """ - self._priority = priority - def accepts( self, file_stream: BinaryIO, @@ -138,12 +106,3 @@ class DocumentConverter: - MissingDependencyException: If the converter requires a dependency that is not installed. """ raise NotImplementedError("Subclasses must implement this method") - - @property - def priority(self) -> float: - """Priority of the converter in markitdown's converter list. Higher priority values are tried first.""" - return self._priority - - @priority.setter - def priority(self, value: float): - self._priority = value diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index a51f227..6086eb9 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -7,6 +7,7 @@ import tempfile import warnings import traceback import io +from dataclasses import dataclass from importlib.metadata import entry_points from typing import Any, List, Optional, Union, BinaryIO from pathlib import Path @@ -47,8 +48,15 @@ from ._exceptions import ( FailedConversionAttempt, ) -# Override mimetype for csv to fix issue on windows -mimetypes.add_type("text/csv", ".csv") + +# Lower priority values are tried first. +PRIORITY_SPECIFIC_FILE_FORMAT = ( + 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia +) +PRIORITY_GENERIC_FILE_FORMAT = ( + 10.0 # Near catch-all converters for mimetypes like text/*, etc. +) + _plugins: List[Any] = [] @@ -73,6 +81,14 @@ def _load_plugins() -> List[Any]: return _plugins +@dataclass(kw_only=True, frozen=True) +class ConverterRegistration: + """A registration of a converter with its priority and other metadata.""" + + converter: DocumentConverter + priority: float + + class MarkItDown: """(In preview) An extremely simple text-based document reader, suitable for LLM use. 
This reader will convert common file-types or webpages to Markdown.""" @@ -100,7 +116,7 @@ class MarkItDown: self._style_map: Union[str | None] = None # Register the converters - self._converters: List[DocumentConverter] = [] + self._converters: List[ConverterRegistration] = [] if ( enable_builtins is None or enable_builtins @@ -128,9 +144,15 @@ class MarkItDown: # Register converters for successful browsing operations # Later registrations are tried first / take higher priority than earlier registrations # To this end, the most specific converters should appear below the most generic converters - self.register_converter(PlainTextConverter()) - self.register_converter(ZipConverter(markitdown=self)) - self.register_converter(HtmlConverter()) + self.register_converter( + PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT + ) + self.register_converter( + ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT + ) + self.register_converter( + HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT + ) self.register_converter(RssConverter()) self.register_converter(WikipediaConverter()) self.register_converter(YouTubeConverter()) @@ -418,13 +440,14 @@ class MarkItDown: # Create a copy of the page_converters list, sorted by priority. # We do this with each call to _convert because the priority of converters may change between calls. # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. 
- sorted_converters = sorted(self._converters, key=lambda x: x.priority) + sorted_registrations = sorted(self._converters, key=lambda x: x.priority) # Remember the initial stream position so that we can return to it cur_pos = file_stream.tell() for stream_info in stream_info_guesses + [StreamInfo()]: - for converter in sorted_converters: + for converter_registration in sorted_registrations: + converter = converter_registration.converter # Sanity check -- make sure the cur_pos is still the same assert ( cur_pos == file_stream.tell() @@ -506,6 +529,34 @@ class MarkItDown: ) self.register_converter(converter) - def register_converter(self, converter: DocumentConverter) -> None: - """Register a page text converter.""" - self._converters.insert(0, converter) + def register_converter( + self, + converter: DocumentConverter, + *, + priority: float = PRIORITY_SPECIFIC_FILE_FORMAT, + ) -> None: + """ + Register a DocumentConverter with a given priority. + + Priorities work as follows: By default, most converters get priority + PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions + are the PlainTextConverter, HtmlConverter, and ZipConverter, which get + priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values + being tried first (i.e., higher priority). + + Just prior to conversion, the converters are sorted by priority, using + a stable sort. This means that converters with the same priority will + remain in the same order, with the most recently registered converters + appearing first. + + We have tight control over the order of built-in converters, but + plugins can register converters in any order. The registration's priority + field reasserts some control over the order of converters. + + Plugins can register converters with any priority, to appear before or + after the built-ins. For example, a plugin with priority 9 will run + before the PlainTextConverter, but after the built-in converters. 
+ """ + self._converters.insert( + 0, ConverterRegistration(converter=converter, priority=priority) + ) diff --git a/packages/markitdown/src/markitdown/converters/_audio_converter.py b/packages/markitdown/src/markitdown/converters/_audio_converter.py index d502deb..845ad5d 100644 --- a/packages/markitdown/src/markitdown/converters/_audio_converter.py +++ b/packages/markitdown/src/markitdown/converters/_audio_converter.py @@ -26,11 +26,6 @@ class AudioConverter(DocumentConverter): Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 2e9913c..7dd9e24 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -26,11 +26,6 @@ class BingSerpConverter(DocumentConverter): NOTE: It is better to use the Bing API """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 00ab0fc..2f116d0 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -69,11 +69,10 @@ class DocumentIntelligenceConverter(DocumentConverter): def __init__( self, *, - priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, endpoint: str, api_version: str = 
"2024-07-31-preview", ): - super().__init__(priority=priority) + super().__init__() # Raise an error if the dependencies are not available. # This is different than other converters since this one isn't even instantiated diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index a5090ac..c568acb 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -29,10 +29,8 @@ class DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def __init__(self): + super().__init__() self._html_converter = HtmlConverter() def accepts( diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 7d0c916..8a8203d 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -20,11 +20,6 @@ ACCEPTED_FILE_EXTENSIONS = [ class HtmlConverter(DocumentConverter): """Anything with content type text/html""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index e03dfe8..dd8fbac 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -18,11 +18,6 @@ class ImageConverter(DocumentConverter): Converts images to markdown via extraction of metadata (if 
`exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured). """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index 490e4e1..f8ba193 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -15,11 +15,6 @@ ACCEPTED_FILE_EXTENSIONS = [".ipynb"] class IpynbConverter(DocumentConverter): """Converts Jupyter Notebook (.ipynb) files to Markdown.""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index cef3dc7..8a61b0c 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -28,11 +28,6 @@ class OutlookMsgConverter(DocumentConverter): - Email body content """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 445dba3..4586ef1 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -34,11 +34,6 @@ class PdfConverter(DocumentConverter): Converts PDFs to Markdown. 
Most style information is ignored, so the results are essentially plain-text. """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 92da511..4a21d3a 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -29,11 +29,6 @@ IGNORE_MIME_TYPE_PREFIXES = [ class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index e51739e..bea1226 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -35,10 +35,8 @@ class PptxConverter(DocumentConverter): Converts PPTX files to Markdown. Supports heading, tables and images with alt text. 
""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def __init__(self): + super().__init__() self._html_converter = HtmlConverter() def accepts( diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 3074c6c..dbafc1b 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -26,11 +26,6 @@ CANDIDATE_FILE_EXTENSIONS = [ class RssConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index c9176f6..5b054af 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -21,11 +21,6 @@ ACCEPTED_FILE_EXTENSIONS = [ class WikipediaConverter(DocumentConverter): """Handle Wikipedia pages separately, focusing only on the main document content.""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index f11af31..3d0e1ab 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -38,10 +38,8 @@ class XlsxConverter(DocumentConverter): Converts XLSX files to Markdown, with each sheet presented as a separate 
Markdown table. """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def __init__(self): + super().__init__() self._html_converter = HtmlConverter() def accepts( @@ -100,10 +98,8 @@ class XlsConverter(DocumentConverter): Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. """ - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) + def __init__(self): + super().__init__() self._html_converter = HtmlConverter() def accepts( diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index 2efc6ea..5a158d5 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -34,11 +34,6 @@ ACCEPTED_FILE_EXTENSIONS = [ class YouTubeConverter(DocumentConverter): """Handle YouTube specially, focusing on the video title, description, and transcript.""" - def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT - ): - super().__init__(priority=priority) - def accepts( self, file_stream: BinaryIO, diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index c60d94a..cb1a7e6 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -61,11 +61,10 @@ class ZipConverter(DocumentConverter): def __init__( self, - priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, *, markitdown: "MarkItDown", ): - super().__init__(priority=priority) + super().__init__() self._markitdown = markitdown def accepts( diff --git a/packages/markitdown/tests/test_markitdown.py 
b/packages/markitdown/tests/test_markitdown.py index 8905253..8c34da0 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -530,8 +530,10 @@ def test_markitdown_exiftool() -> None: finally: warnings.resetwarnings() - # Test explicitly setting the location of exiftool which_exiftool = shutil.which("exiftool") + assert which_exiftool is not None + + # Test explicitly setting the location of exiftool markitdown = MarkItDown(exiftool_path=which_exiftool) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) for key in JPG_TEST_EXIFTOOL: