Move priority outside of DocumentConverter, allowing converters to be reprioritized at registration time and keeping the DocumentConverter interface simple.
This commit is contained in:
parent
84f8198d8a
commit
a7ae7c53d8
20 changed files with 82 additions and 129 deletions
|
|
@ -3,7 +3,11 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown
|
||||
from ._markitdown import (
|
||||
MarkItDown,
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||
PRIORITY_GENERIC_FILE_FORMAT,
|
||||
)
|
||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||
from ._stream_info import StreamInfo
|
||||
from ._exceptions import (
|
||||
|
|
@ -25,4 +29,6 @@ __all__ = [
|
|||
"FileConversionException",
|
||||
"UnsupportedFormatException",
|
||||
"StreamInfo",
|
||||
"PRIORITY_SPECIFIC_FILE_FORMAT",
|
||||
"PRIORITY_GENERIC_FILE_FORMAT",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -45,38 +45,6 @@ class DocumentConverterResult:
|
|||
class DocumentConverter:
|
||||
"""Abstract superclass of all DocumentConverters."""
|
||||
|
||||
# Lower priority values are tried first.
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
||||
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
|
||||
)
|
||||
PRIORITY_GENERIC_FILE_FORMAT = (
|
||||
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
||||
)
|
||||
|
||||
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
|
||||
"""
|
||||
Initialize the DocumentConverter with a given priority.
|
||||
|
||||
Priorities work as follows: By default, most converters get priority
|
||||
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
|
||||
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
|
||||
with lower values being tried first (i.e., higher priority).
|
||||
|
||||
Just prior to conversion, the converters are sorted by priority, using
|
||||
a stable sort. This means that converters with the same priority will
|
||||
remain in the same order, with the most recently registered converters
|
||||
appearing first.
|
||||
|
||||
We have tight control over the order of built-in converters, but
|
||||
plugins can register converters in any order. A converter's priority
|
||||
field reasserts some control over the order of converters.
|
||||
|
||||
Plugins can register converters with any priority, to appear before or
|
||||
after the built-ins. For example, a plugin with priority 9 will run
|
||||
before the PlainTextConverter, but after the built-in converters.
|
||||
"""
|
||||
self._priority = priority
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
@ -138,12 +106,3 @@ class DocumentConverter:
|
|||
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
||||
"""
|
||||
raise NotImplementedError("Subclasses must implement this method")
|
||||
|
||||
@property
|
||||
def priority(self) -> float:
|
||||
"""Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
|
||||
return self._priority
|
||||
|
||||
@priority.setter
|
||||
def priority(self, value: float):
|
||||
self._priority = value
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import tempfile
|
|||
import warnings
|
||||
import traceback
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from importlib.metadata import entry_points
|
||||
from typing import Any, List, Optional, Union, BinaryIO
|
||||
from pathlib import Path
|
||||
|
|
@ -47,8 +48,15 @@ from ._exceptions import (
|
|||
FailedConversionAttempt,
|
||||
)
|
||||
|
||||
# Override mimetype for csv to fix issue on windows
|
||||
mimetypes.add_type("text/csv", ".csv")
|
||||
|
||||
# Lower priority values are tried first.
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
||||
0.0 # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., Wikipedia
|
||||
)
|
||||
PRIORITY_GENERIC_FILE_FORMAT = (
|
||||
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
||||
)
|
||||
|
||||
|
||||
_plugins: List[Any] = []
|
||||
|
||||
|
|
@ -73,6 +81,14 @@ def _load_plugins() -> List[Any]:
|
|||
return _plugins
|
||||
|
||||
|
||||
@dataclass(kw_only=True, frozen=True)
|
||||
class ConverterRegistration:
|
||||
"""A registration of a converter with its priority and other metadata."""
|
||||
|
||||
converter: DocumentConverter
|
||||
priority: float
|
||||
|
||||
|
||||
class MarkItDown:
|
||||
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
||||
This reader will convert common file-types or webpages to Markdown."""
|
||||
|
|
@ -100,7 +116,7 @@ class MarkItDown:
|
|||
self._style_map: Union[str | None] = None
|
||||
|
||||
# Register the converters
|
||||
self._converters: List[DocumentConverter] = []
|
||||
self._converters: List[ConverterRegistration] = []
|
||||
|
||||
if (
|
||||
enable_builtins is None or enable_builtins
|
||||
|
|
@ -128,9 +144,15 @@ class MarkItDown:
|
|||
# Register converters for successful browsing operations
|
||||
# Later registrations are tried first / take higher priority than earlier registrations
|
||||
# To this end, the most specific converters should appear below the most generic converters
|
||||
self.register_converter(PlainTextConverter())
|
||||
self.register_converter(ZipConverter(markitdown=self))
|
||||
self.register_converter(HtmlConverter())
|
||||
self.register_converter(
|
||||
PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
|
||||
)
|
||||
self.register_converter(
|
||||
ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
|
||||
)
|
||||
self.register_converter(
|
||||
HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
|
||||
)
|
||||
self.register_converter(RssConverter())
|
||||
self.register_converter(WikipediaConverter())
|
||||
self.register_converter(YouTubeConverter())
|
||||
|
|
@ -418,13 +440,14 @@ class MarkItDown:
|
|||
# Create a copy of the page_converters list, sorted by priority.
|
||||
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
|
||||
sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
|
||||
|
||||
# Remember the initial stream position so that we can return to it
|
||||
cur_pos = file_stream.tell()
|
||||
|
||||
for stream_info in stream_info_guesses + [StreamInfo()]:
|
||||
for converter in sorted_converters:
|
||||
for converter_registration in sorted_registrations:
|
||||
converter = converter_registration.converter
|
||||
# Sanity check -- make sure the cur_pos is still the same
|
||||
assert (
|
||||
cur_pos == file_stream.tell()
|
||||
|
|
@ -506,6 +529,34 @@ class MarkItDown:
|
|||
)
|
||||
self.register_converter(converter)
|
||||
|
||||
def register_converter(self, converter: DocumentConverter) -> None:
|
||||
"""Register a page text converter."""
|
||||
self._converters.insert(0, converter)
|
||||
def register_converter(
|
||||
self,
|
||||
converter: DocumentConverter,
|
||||
*,
|
||||
priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||
) -> None:
|
||||
"""
|
||||
Register a DocumentConverter with a given priority.
|
||||
|
||||
Priorities work as follows: By default, most converters get priority
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions
|
||||
are the PlainTextConverter, HtmlConverter, and ZipConverter, which get
|
||||
priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values
|
||||
being tried first (i.e., higher priority).
|
||||
|
||||
Just prior to conversion, the converters are sorted by priority, using
|
||||
a stable sort. This means that converters with the same priority will
|
||||
remain in the same order, with the most recently registered converters
|
||||
appearing first.
|
||||
|
||||
We have tight control over the order of built-in converters, but
|
||||
plugins can register converters in any order. The registration's priority
|
||||
field reasserts some control over the order of converters.
|
||||
|
||||
Plugins can register converters with any priority, to appear before or
|
||||
after the built-ins. For example, a plugin with priority 9 will run
|
||||
before the PlainTextConverter, but after the format-specific built-in converters.
|
||||
"""
|
||||
self._converters.insert(
|
||||
0, ConverterRegistration(converter=converter, priority=priority)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -26,11 +26,6 @@ class AudioConverter(DocumentConverter):
|
|||
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -26,11 +26,6 @@ class BingSerpConverter(DocumentConverter):
|
|||
NOTE: It is better to use the Bing API
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -69,11 +69,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||
def __init__(
|
||||
self,
|
||||
*,
|
||||
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||
endpoint: str,
|
||||
api_version: str = "2024-07-31-preview",
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
super().__init__()
|
||||
|
||||
# Raise an error if the dependencies are not available.
|
||||
# This is different than other converters since this one isn't even instantiated
|
||||
|
|
|
|||
|
|
@ -29,10 +29,8 @@ class DocxConverter(HtmlConverter):
|
|||
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
|
|
|
|||
|
|
@ -20,11 +20,6 @@ ACCEPTED_FILE_EXTENSIONS = [
|
|||
class HtmlConverter(DocumentConverter):
|
||||
"""Anything with content type text/html"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -18,11 +18,6 @@ class ImageConverter(DocumentConverter):
|
|||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -15,11 +15,6 @@ ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
|
|||
class IpynbConverter(DocumentConverter):
|
||||
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -28,11 +28,6 @@ class OutlookMsgConverter(DocumentConverter):
|
|||
- Email body content
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -34,11 +34,6 @@ class PdfConverter(DocumentConverter):
|
|||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -29,11 +29,6 @@ IGNORE_MIME_TYPE_PREFIXES = [
|
|||
class PlainTextConverter(DocumentConverter):
|
||||
"""Anything with content type text/plain"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -35,10 +35,8 @@ class PptxConverter(DocumentConverter):
|
|||
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
|
|
|
|||
|
|
@ -26,11 +26,6 @@ CANDIDATE_FILE_EXTENSIONS = [
|
|||
class RssConverter(DocumentConverter):
|
||||
"""Convert RSS / Atom type to markdown"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -21,11 +21,6 @@ ACCEPTED_FILE_EXTENSIONS = [
|
|||
class WikipediaConverter(DocumentConverter):
|
||||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -38,10 +38,8 @@ class XlsxConverter(DocumentConverter):
|
|||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
|
|
@ -100,10 +98,8 @@ class XlsConverter(DocumentConverter):
|
|||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
|
|
|
|||
|
|
@ -34,11 +34,6 @@ ACCEPTED_FILE_EXTENSIONS = [
|
|||
class YouTubeConverter(DocumentConverter):
|
||||
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
|
|||
|
|
@ -61,11 +61,10 @@ class ZipConverter(DocumentConverter):
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||
*,
|
||||
markitdown: "MarkItDown",
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
super().__init__()
|
||||
self._markitdown = markitdown
|
||||
|
||||
def accepts(
|
||||
|
|
|
|||
|
|
@ -530,8 +530,10 @@ def test_markitdown_exiftool() -> None:
|
|||
finally:
|
||||
warnings.resetwarnings()
|
||||
|
||||
# Test explicitly setting the location of exiftool
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
assert which_exiftool is not None
|
||||
|
||||
# Test explicitly setting the location of exiftool
|
||||
markitdown = MarkItDown(exiftool_path=which_exiftool)
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||
for key in JPG_TEST_EXIFTOOL:
|
||||
|
|
|
|||
Loading…
Reference in a new issue