Updated the plugin interface.

This commit is contained in:
Adam Fourney 2025-02-10 11:05:20 -08:00
parent e54f706ae3
commit f188abe9d6
5 changed files with 105 additions and 41 deletions

View file

@ -5,15 +5,14 @@
[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
This project shows how to create a sample plugin for MarkItDown. The two most important parts are as follows:
This project shows how to create a sample plugin for MarkItDown. The most important parts are as follows:
First, implement your custom DocumentConverter:
First, implement your custom DocumentConverter:
```python
from typing import Union
from markitdown import DocumentConverter, DocumentConverterResult
class RtfConverter(DocumentConverter):
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an RTF file
@ -30,17 +29,34 @@ class RtfConverter(DocumentConverter):
)
```
Second, create an entrypoint in the `pyproject.toml` file:
Next, make sure your package implements and exports the following:
```toml
[project.entry-points."markitdown.plugin.converters"]
rtf = "markitdown_sample_plugin:RtfConverter"
```python
# The version of the plugin interface that this plugin uses.
# The only supported version is 1 for now.
__plugin_interface_version__ = 1
# The main entrypoint for the plugin. This is called each time MarkItDown instances are created.
def register_converters(markitdown: MarkItDown, **kwargs):
"""
Called during construction of MarkItDown instances to register converters provided by plugins.
"""
# Simply create and attach an RtfConverter instance
markitdown.register_converter(RtfConverter())
```
Here, the value of `rtf` can be any key, but should ideally be the name of the plugin, or the extension it supports. The value is the fully qualified name of the class that implements the `DocumentConverter` interface.
Finally, create an entrypoint in the `pyproject.toml` file:
Once the plugin package is installed (e.g., `pip install -e .`), MarkItDown will automatically discover the plugin and register it for use.
```toml
[project.entry-points."markitdown.plugin"]
sample_plugin = "markitdown_sample_plugin"
```
Here, the key `sample_plugin` can be any name, but should ideally be the name of the plugin. The value is the fully qualified name of the package implementing the plugin.
Once the plugin package is installed (e.g., `pip install -e .`), MarkItDown will automatically discover the plugin and register it for use.
## Trademarks

View file

@ -37,8 +37,8 @@ Source = "https://github.com/microsoft/markitdown"
path = "src/markitdown_sample_plugin/__about__.py"
# IMPORTANT: MarkItDown will look for this entry point to find the plugin.
[project.entry-points."markitdown.plugin.converters"]
rtf = "markitdown_sample_plugin:RtfConverter"
[project.entry-points."markitdown.plugin"]
sample_plugin = "markitdown_sample_plugin"
[tool.hatch.envs.types]
extra-dependencies = [

View file

@ -2,10 +2,12 @@
#
# SPDX-License-Identifier: MIT
from ._rtf_converter import RtfConverter
from ._plugin import __plugin_interface_version__, register_converters, RtfConverter
from .__about__ import __version__
__all__ = [
"__version__",
"__plugin_interface_version__",
"register_converters",
"RtfConverter",
]

View file

@ -1,7 +1,20 @@
from typing import Union
from striprtf.striprtf import rtf_to_text
from markitdown import DocumentConverter, DocumentConverterResult
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
__plugin_interface_version__ = (
1 # The version of the plugin interface that this plugin uses
)
def register_converters(markitdown: MarkItDown, **kwargs):
    """
    Plugin entrypoint: attach this plugin's converters to a MarkItDown instance.

    Called during construction of MarkItDown instances. Extra keyword
    arguments are accepted but not used here.
    """

    # This plugin contributes a single converter: a fresh RtfConverter instance
    rtf_converter = RtfConverter()
    markitdown.register_converter(rtf_converter)
class RtfConverter(DocumentConverter):

View file

@ -3,6 +3,8 @@ import mimetypes
import os
import re
import tempfile
import warnings
import traceback
from importlib.metadata import entry_points
from typing import Any, List, Optional, Union
from pathlib import Path
@ -49,7 +51,30 @@ from ._exceptions import (
mimetypes.add_type("text/csv", ".csv")
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
PRIORITY_GENERIC_FILE_FORMAT = -10.0
PRIORITY_GENERIC_FILE_FORMAT = 10.0
_plugins: Union[None | List[Any]] = None
def _load_plugins() -> Union[None | List[Any]]:
"""Lazy load plugins, exiting early if already loaded."""
global _plugins
# Skip if we've already loaded plugins
if _plugins is not None:
return _plugins
# Load plugins
_plugins = []
for entry_point in entry_points(group="markitdown.plugin"):
try:
_plugins.append(entry_point.load())
except Exception:
tb = traceback.format_exc()
warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}")
return _plugins
class MarkItDown:
@ -58,6 +83,8 @@ class MarkItDown:
def __init__(
self,
*,
load_plugins: bool = True,
requests_session: Optional[requests.Session] = None,
llm_client: Optional[Any] = None,
llm_model: Optional[str] = None,
@ -115,40 +142,38 @@ class MarkItDown:
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
self.register_page_converter(PlainTextConverter())
self.register_page_converter(ZipConverter())
self.register_page_converter(HtmlConverter())
self.register_page_converter(RssConverter())
self.register_page_converter(WikipediaConverter())
self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter())
self.register_page_converter(DocxConverter())
self.register_page_converter(XlsxConverter())
self.register_page_converter(XlsConverter())
self.register_page_converter(PptxConverter())
self.register_page_converter(WavConverter())
self.register_page_converter(Mp3Converter())
self.register_page_converter(ImageConverter())
self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter())
self.register_page_converter(OutlookMsgConverter())
self.register_converter(PlainTextConverter())
self.register_converter(ZipConverter())
self.register_converter(HtmlConverter())
self.register_converter(RssConverter())
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
self.register_converter(WavConverter())
self.register_converter(Mp3Converter())
self.register_converter(ImageConverter())
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None:
self.register_page_converter(
self.register_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
)
# Load plugins
for entry_point in entry_points(group="markitdown.plugin.converters"):
try:
plugin = entry_point.load()
self.register_page_converter(plugin())
# print(f"Loaded plugin {entry_point.value} as {entry_point.name}")
except ConverterPrerequisiteException as e:
# print(f"Skipping plugin {entry_point.name} because of missing prerequisite: {e}")
pass
if load_plugins:
for plugin in _load_plugins():
try:
plugin.register_converters(self)
except Exception:
tb = traceback.format_exc()
warn(f"Plugin '{plugin}' failed to register converters:\n{tb}")
def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any
@ -404,5 +429,13 @@ class MarkItDown:
return []
def register_page_converter(self, converter: DocumentConverter) -> None:
    """DEPRECATED: Use register_converter instead.

    Emits a DeprecationWarning and then forwards ``converter`` unchanged to
    ``register_converter``.
    """
    warn(
        "register_page_converter is deprecated. Use register_converter instead.",
        DeprecationWarning,
        # Attribute the warning to the code calling this shim, not the shim.
        stacklevel=2,
    )
    self.register_converter(converter)
def register_converter(self, converter: DocumentConverter) -> None:
    """Add ``converter`` to the front of this instance's converter list."""
    # Slice-assign at index 0 so the newest registration sits first in the list.
    self._page_converters[:0] = [converter]