Move priority outside of DocumentConverter, allowing converters to be reprioritized and keeping the DocumentConverter interface simple.

This commit is contained in:
Adam Fourney 2025-03-05 20:09:18 -08:00
parent 84f8198d8a
commit a7ae7c53d8
20 changed files with 82 additions and 129 deletions

View file

@ -3,7 +3,11 @@
# SPDX-License-Identifier: MIT
from .__about__ import __version__
from ._markitdown import MarkItDown
from ._markitdown import (
MarkItDown,
PRIORITY_SPECIFIC_FILE_FORMAT,
PRIORITY_GENERIC_FILE_FORMAT,
)
from ._base_converter import DocumentConverterResult, DocumentConverter
from ._stream_info import StreamInfo
from ._exceptions import (
@ -25,4 +29,6 @@ __all__ = [
"FileConversionException",
"UnsupportedFormatException",
"StreamInfo",
"PRIORITY_SPECIFIC_FILE_FORMAT",
"PRIORITY_GENERIC_FILE_FORMAT",
]

View file

@ -45,38 +45,6 @@ class DocumentConverterResult:
class DocumentConverter:
"""Abstract superclass of all DocumentConverters."""
# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
10.0 # Near catch-all converters for mimetypes like text/*, etc.
)
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
"""
Initialize the DocumentConverter with a given priority.
Priorities work as follows: By default, most converters get priority
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10),
with lower values being tried first (i.e., higher priority).
Just prior to conversion, the converters are sorted by priority, using
a stable sort. This means that converters with the same priority will
remain in the same order, with the most recently registered converters
appearing first.
We have tight control over the order of built-in converters, but
plugins can register converters in any order. A converter's priority
field reasserts some control over the order of converters.
Plugins can register converters with any priority, to appear before or
after the built-ins. For example, a plugin with priority 9 will run
before the PlainTextConverter, but after the built-in converters.
"""
self._priority = priority
def accepts(
self,
file_stream: BinaryIO,
@ -138,12 +106,3 @@ class DocumentConverter:
- MissingDependencyException: If the converter requires a dependency that is not installed.
"""
raise NotImplementedError("Subclasses must implement this method")
@property
def priority(self) -> float:
"""Priority of the converter in markitdown's converter list. Higher priority values are tried first."""
return self._priority
@priority.setter
def priority(self, value: float):
self._priority = value

View file

@ -7,6 +7,7 @@ import tempfile
import warnings
import traceback
import io
from dataclasses import dataclass
from importlib.metadata import entry_points
from typing import Any, List, Optional, Union, BinaryIO
from pathlib import Path
@ -47,8 +48,15 @@ from ._exceptions import (
FailedConversionAttempt,
)
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")
# Converter priorities: a smaller number means the converter is tried earlier.

# Format-specific converters (e.g., .docx, .pdf, .xlsx) and converters for
# specific pages (e.g., wikipedia).
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0

# Near catch-all converters for broad mimetypes such as text/*.
PRIORITY_GENERIC_FILE_FORMAT = 10.0
_plugins: List[Any] = []
@ -73,6 +81,14 @@ def _load_plugins() -> List[Any]:
return _plugins
@dataclass(frozen=True, kw_only=True)
class ConverterRegistration:
    """Pairs a registered converter with the priority it was registered under.

    Instances are immutable (frozen) and must be built with keyword arguments.
    """

    # The converter instance that was registered.
    converter: DocumentConverter
    # Sort key used when ordering registrations; lower values are tried first.
    priority: float
class MarkItDown:
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
This reader will convert common file-types or webpages to Markdown."""
@ -100,7 +116,7 @@ class MarkItDown:
self._style_map: Union[str | None] = None
# Register the converters
self._converters: List[DocumentConverter] = []
self._converters: List[ConverterRegistration] = []
if (
enable_builtins is None or enable_builtins
@ -128,9 +144,15 @@ class MarkItDown:
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
self.register_converter(PlainTextConverter())
self.register_converter(ZipConverter(markitdown=self))
self.register_converter(HtmlConverter())
self.register_converter(
PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(
ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(
HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(RssConverter())
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
@ -418,13 +440,14 @@ class MarkItDown:
# Create a copy of the page_converters list, sorted by priority.
# We do this with each call to _convert because the priority of converters may change between calls.
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
# Remember the initial stream position so that we can return to it
cur_pos = file_stream.tell()
for stream_info in stream_info_guesses + [StreamInfo()]:
for converter in sorted_converters:
for converter_registration in sorted_registrations:
converter = converter_registration.converter
# Sanity check -- make sure the cur_pos is still the same
assert (
cur_pos == file_stream.tell()
@ -506,6 +529,34 @@ class MarkItDown:
)
self.register_converter(converter)
def register_converter(self, converter: DocumentConverter) -> None:
"""Register a page text converter."""
self._converters.insert(0, converter)
def register_converter(
    self,
    converter: DocumentConverter,
    *,
    priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
) -> None:
    """
    Register a DocumentConverter with a given priority.

    Priorities work as follows: lower values are tried first (i.e., they
    have higher priority). By default, converters are registered with
    PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions are the
    PlainTextConverter, HtmlConverter, and ZipConverter, which are
    registered with PRIORITY_GENERIC_FILE_FORMAT (== 10).

    Just prior to conversion, the registrations are sorted by priority,
    using a stable sort. This means that converters with the same priority
    will remain in the same order, with the most recently registered
    converters appearing first.

    We have tight control over the order of built-in converters, but
    plugins can register converters in any order. The registration's
    priority field reasserts some control over the order of converters.

    Plugins can register converters with any priority, to appear before or
    after the built-ins. For example, a plugin registered with priority 9
    will run before the PlainTextConverter, HtmlConverter, and ZipConverter
    (priority 10), but after the built-in converters registered with the
    default priority (0).

    Args:
        converter: The converter instance to register.
        priority: Keyword-only sort key for this registration; lower values
            are tried first. Defaults to PRIORITY_SPECIFIC_FILE_FORMAT.
    """
    registration = ConverterRegistration(converter=converter, priority=priority)
    # Insert at the front: among equal priorities the stable sort keeps list
    # order, so the most recent registration is tried first.
    self._converters.insert(0, registration)

View file

@ -26,11 +26,6 @@ class AudioConverter(DocumentConverter):
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -26,11 +26,6 @@ class BingSerpConverter(DocumentConverter):
NOTE: It is better to use the Bing API
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -69,11 +69,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
def __init__(
self,
*,
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
endpoint: str,
api_version: str = "2024-07-31-preview",
):
super().__init__(priority=priority)
super().__init__()
# Raise an error if the dependencies are not available.
# This is different than other converters since this one isn't even instantiated

View file

@ -29,10 +29,8 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(

View file

@ -20,11 +20,6 @@ ACCEPTED_FILE_EXTENSIONS = [
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -18,11 +18,6 @@ class ImageConverter(DocumentConverter):
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -15,11 +15,6 @@ ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
class IpynbConverter(DocumentConverter):
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -28,11 +28,6 @@ class OutlookMsgConverter(DocumentConverter):
- Email body content
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -34,11 +34,6 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -29,11 +29,6 @@ IGNORE_MIME_TYPE_PREFIXES = [
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -35,10 +35,8 @@ class PptxConverter(DocumentConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(

View file

@ -26,11 +26,6 @@ CANDIDATE_FILE_EXTENSIONS = [
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -21,11 +21,6 @@ ACCEPTED_FILE_EXTENSIONS = [
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -38,10 +38,8 @@ class XlsxConverter(DocumentConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(
@ -100,10 +98,8 @@ class XlsConverter(DocumentConverter):
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(

View file

@ -34,11 +34,6 @@ ACCEPTED_FILE_EXTENSIONS = [
class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def accepts(
self,
file_stream: BinaryIO,

View file

@ -61,11 +61,10 @@ class ZipConverter(DocumentConverter):
def __init__(
self,
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
*,
markitdown: "MarkItDown",
):
super().__init__(priority=priority)
super().__init__()
self._markitdown = markitdown
def accepts(

View file

@ -530,8 +530,10 @@ def test_markitdown_exiftool() -> None:
finally:
warnings.resetwarnings()
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")
assert which_exiftool is not None
# Test explicitly setting the location of exiftool
markitdown = MarkItDown(exiftool_path=which_exiftool)
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL: