Added priority argument to all converter constructors.

This commit is contained in:
Adam Fourney 2025-02-11 10:13:36 -08:00
parent 4b62506451
commit 4298cfad8d
21 changed files with 128 additions and 15 deletions

View file

@ -47,10 +47,6 @@ from ._exceptions import (
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
PRIORITY_GENERIC_FILE_FORMAT = 10.0
_plugins: Union[None | List[Any]] = None
@ -103,6 +99,23 @@ class MarkItDown:
# Register the converters
self._page_converters: List[DocumentConverter] = []
# Note: We have tight control over the order of built-in converters, but
# plugins can register converters in any order. A converter's .priority
# reasserts some control over the order of converters.
#
# Priorities work as follows. By default, most converters get priority
# DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions
# are near catch-all converters such as the PlainTextConverter, which get
# priority DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT (== 10). Lower
# values are tried first (i.e., they have higher priority).
#
# Just prior to conversion, the converters are sorted by priority, using
# a stable sort. This means that converters with the same priority will
# remain in the same order, with the most recently registered converters
# appearing first.
#
# Plugins can register converters with any priority, to appear before or
# after the built-ins. For example, a plugin with priority 9 will run
# before the PlainTextConverter, but after the format-specific built-in
# converters (which default to priority 0).
if (
enable_builtins is None or enable_builtins
): # Default to True when not specified
@ -123,6 +136,8 @@ class MarkItDown:
self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")
if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations

View file

@ -12,7 +12,15 @@ class DocumentConverterResult:
class DocumentConverter:
"""Abstract superclass of all DocumentConverters."""
def __init__(self, priority: float = 0.0):
# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
10.0 # Near catch-all converters for mimetypes like text/*, etc.
)
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
self._priority = priority
def convert(

View file

@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
NOTE: It is better to use the Bing API
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the Bing SERP converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP
extension = kwargs.get("file_extension", "")

View file

@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):
def __init__(
self,
*,
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
endpoint: str,
api_version: str = "2024-07-31-preview",
):
super().__init__(priority=priority)
self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(

View file

@ -6,6 +6,7 @@ from ._base import (
DocumentConverterResult,
)
from ._base import DocumentConverter
from ._html_converter import HtmlConverter
@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the DOCX converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    # Forwarded explicitly: the HtmlConverter parent defaults to the
    # generic priority, which is not what a DOCX-specific converter wants.
    super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")

View file

@ -8,6 +8,11 @@ from ._markdownify import _CustomMarkdownify
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT,
):
    """Create the HTML converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the generic (near
            catch-all) priority.
    """
    super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View file

@ -1,5 +1,5 @@
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the image converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    # Forwarded explicitly: the MediaConverter parent defaults to the
    # generic priority.
    super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image
extension = kwargs.get("file_extension", "")

View file

@ -12,6 +12,11 @@ from .._exceptions import FileConversionException
class IpynbConverter(DocumentConverter):
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the Jupyter Notebook converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View file

@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
Abstract class for multi-modal media (e.g., images and audio)
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT,
):
    """Create the media converter base.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the generic (near
            catch-all) priority.
    """
    super().__init__(priority=priority)
def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
@ -27,10 +32,10 @@ This warning will be removed in future releases.
return None
else:
try:
if True:
result = subprocess.run(
[exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout
return json.loads(result)[0]
except Exception:
return None
# except Exception:
# return None

View file

@ -1,6 +1,6 @@
import tempfile
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings
@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the MP3 converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3
extension = kwargs.get("file_extension", "")

View file

@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
- Email body content
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the Outlook message converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View file

@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the PDF converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF
extension = kwargs.get("file_extension", "")

View file

@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT,
):
    """Create the plain-text converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the generic (near
            catch-all) priority.
    """
    super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View file

@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports headings, tables and images with alt text.
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the PPTX converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    # Forwarded explicitly: the HtmlConverter parent defaults to the
    # generic priority.
    super().__init__(priority=priority)
def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
):

View file

@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the RSS / Atom converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:

View file

@ -1,5 +1,5 @@
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
# Optional Transcription support
@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the WAV converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    # Forwarded explicitly: the MediaConverter parent defaults to the
    # generic priority.
    super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV
extension = kwargs.get("file_extension", "")

View file

@ -10,6 +10,11 @@ from ._markdownify import _CustomMarkdownify
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the Wikipedia page converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View file

@ -2,7 +2,7 @@ from typing import Union
import pandas as pd
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the XLSX converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    # Forwarded explicitly: the HtmlConverter parent defaults to the
    # generic priority.
    super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")

View file

@ -19,6 +19,11 @@ except ModuleNotFoundError:
class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the YouTube page converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View file

@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
- Cleans up temporary files after processing
"""
def __init__(
    self,
    priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
):
    """Create the ZIP archive converter.

    Args:
        priority: Ordering value handed to the parent constructor; lower
            values are tried first. Defaults to the format-specific
            priority.
    """
    super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View file

@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:
# Script entry point: when this file is executed directly, call the selected
# test functions in order and print a confirmation once they all return.
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_remote()
test_markitdown_local()
test_markitdown_exiftool()
# The deprecation and LLM tests below are commented out, so they are not run
# from this entry point.
# test_markitdown_deprecation()
# test_markitdown_llm()
# Reached only if none of the test calls above raised.
print("All tests passed!")