From 540410e5c86a287f8d0a94a1494ece063ddce5c0 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 11 Feb 2025 12:31:17 -0800 Subject: [PATCH] Promote discussion of converter priority to a docstring. --- .../markitdown/src/markitdown/_markitdown.py | 18 --------------- .../src/markitdown/converters/_base.py | 22 +++++++++++++++++++ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 36c8afb..8669ad5 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -99,23 +99,6 @@ class MarkItDown: # Register the converters self._page_converters: List[DocumentConverter] = [] - # Note: We have tight control over the order of built-in converters, but - # plugins can register converters in any order. A converter's .priority - # reasserts some control over the order of converters. - # - # Priorities work as follows. By default, most converters get priority - # DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception - # is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), - # with lower values being tried first (i.e., higher priority). - # - # Just prior to conversion, the converters are sorted by priority, using - # a stable sort. This means that converters with the same priority will - # remain in the same order, with the most recently registered converters - # appearing first. - # - # Plugins can register converters with any priority, to appear before or - # after the built-ins. For example, a plugin with priority 9 will run - # before the PlainTextConverter, but after the built-in converters. if ( enable_builtins is None or enable_builtins ): # Default to True when not specified @@ -128,7 +111,6 @@ class MarkItDown: """ Enable and register built-in converters. Built-in converters are enabled by default. 
- This method should only be called once, if built-ins were initially disabled. """ if not self._builtins_enabled: # TODO: Move these into converter constructors diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py index 0a768dc..6df37f6 100644 --- a/packages/markitdown/src/markitdown/converters/_base.py +++ b/packages/markitdown/src/markitdown/converters/_base.py @@ -21,6 +21,28 @@ class DocumentConverter: ) def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): + """ + Initialize the DocumentConverter with a given priority. + + Priorities work as follows: By default, most converters get priority + DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception + is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10), + with lower values being tried first (i.e., higher priority). + + Just prior to conversion, the converters are sorted by priority, using + a stable sort. This means that converters with the same priority will + remain in the same order, with the most recently registered converters + appearing first. + + We have tight control over the order of built-in converters, but + plugins can register converters in any order. A converter's priority + field reasserts some control over the order of converters. + This method should only be called once, if built-ins were initially disabled. + + Plugins can register converters with any priority, to appear before or + after the built-ins. For example, a plugin with priority 9 will run + before the PlainTextConverter, but after the built-in converters. + """ self._priority = priority def convert(