Updated the plugin interface.
This commit is contained in:
parent
e54f706ae3
commit
f188abe9d6
5 changed files with 105 additions and 41 deletions
|
|
@ -5,15 +5,14 @@
|
|||
[](https://github.com/microsoft/autogen)
|
||||
|
||||
|
||||
This project shows how to create a sample plugin for MarkItDown. The two most important parts are as follows:
|
||||
This project shows how to create a sample plugin for MarkItDown. The most important parts are as follows:
|
||||
|
||||
First, implement your custom DocumentConverter:
|
||||
First, implement your custom DocumentConverter:
|
||||
|
||||
```python
|
||||
from typing import Union
|
||||
from markitdown import DocumentConverter, DocumentConverterResult
|
||||
|
||||
|
||||
class RtfConverter(DocumentConverter):
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not an RTF file
|
||||
|
|
@ -30,17 +29,34 @@ class RtfConverter(DocumentConverter):
|
|||
)
|
||||
```
|
||||
|
||||
Second, create an entrypoint in the `pyproject.toml` file:
|
||||
Next, make sure your package implements and exports the following:
|
||||
|
||||
```toml
|
||||
[project.entry-points."markitdown.plugin.converters"]
|
||||
rtf = "markitdown_sample_plugin:RtfConverter"
|
||||
```python
|
||||
# The version of the plugin interface that this plugin uses.
|
||||
# The only supported version is 1 for now.
|
||||
__plugin_interface_version__ = 1
|
||||
|
||||
# The main entrypoint for the plugin. This is called each time MarkItDown instances are created.
|
||||
def register_converters(markitdown: MarkItDown, **kwargs):
|
||||
"""
|
||||
Called during construction of MarkItDown instances to register converters provided by plugins.
|
||||
"""
|
||||
|
||||
# Simply create and attach an RtfConverter instance
|
||||
markitdown.register_converter(RtfConverter())
|
||||
```
|
||||
|
||||
Here, the value of `rtf` can be any key, but should ideally be the name of the plugin, or the extension it supports. The value is the fully qualified name of the class that implements the `DocumentConverter` interface.
|
||||
|
||||
Finally, create an entrypoint in the `pyproject.toml` file:
|
||||
|
||||
Once the plugin package is installed (e.g., `pip install -e .`), MarkItDown will automatically discover the plugin and register it for use.
|
||||
```toml
|
||||
[project.entry-points."markitdown.plugin"]
|
||||
sample_plugin = "markitdown_sample_plugin"
|
||||
```
|
||||
|
||||
Here, the key `sample_plugin` can be any name, but should ideally be the name of the plugin. The value is the fully qualified name of the package implementing the plugin.
|
||||
|
||||
Once the plugin package is installed (e.g., `pip install -e .`), MarkItDown will automatically discover and register it for use.
|
||||
|
||||
|
||||
## Trademarks
|
||||
|
|
|
|||
|
|
@ -37,8 +37,8 @@ Source = "https://github.com/microsoft/markitdown"
|
|||
path = "src/markitdown_sample_plugin/__about__.py"
|
||||
|
||||
# IMPORTANT: MarkItDown will look for this entry point to find the plugin.
|
||||
[project.entry-points."markitdown.plugin.converters"]
|
||||
rtf = "markitdown_sample_plugin:RtfConverter"
|
||||
[project.entry-points."markitdown.plugin"]
|
||||
sample_plugin = "markitdown_sample_plugin"
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
|
|
|
|||
|
|
@ -2,10 +2,12 @@
|
|||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from ._rtf_converter import RtfConverter
|
||||
from ._plugin import __plugin_interface_version__, register_converters, RtfConverter
|
||||
from .__about__ import __version__
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"__plugin_interface_version__",
|
||||
"register_converters",
|
||||
"RtfConverter",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,7 +1,20 @@
|
|||
from typing import Union
|
||||
from striprtf.striprtf import rtf_to_text
|
||||
|
||||
from markitdown import DocumentConverter, DocumentConverterResult
|
||||
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
|
||||
|
||||
__plugin_interface_version__ = (
|
||||
1 # The version of the plugin interface that this plugin uses
|
||||
)
|
||||
|
||||
|
||||
def register_converters(markitdown: MarkItDown, **kwargs):
    """
    Plugin entry point: attach this plugin's converters to a MarkItDown instance.

    Any extra keyword arguments are accepted but not used here.
    """

    # This sample plugin contributes exactly one converter
    rtf_converter = RtfConverter()
    markitdown.register_converter(rtf_converter)
|
||||
|
||||
|
||||
class RtfConverter(DocumentConverter):
|
||||
|
|
@ -3,6 +3,8 @@ import mimetypes
|
|||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import warnings
|
||||
import traceback
|
||||
from importlib.metadata import entry_points
|
||||
from typing import Any, List, Optional, Union
|
||||
from pathlib import Path
|
||||
|
|
@ -49,7 +51,30 @@ from ._exceptions import (
|
|||
mimetypes.add_type("text/csv", ".csv")
|
||||
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
|
||||
PRIORITY_GENERIC_FILE_FORMAT = -10.0
|
||||
PRIORITY_GENERIC_FILE_FORMAT = 10.0
|
||||
|
||||
|
||||
_plugins: Union[None | List[Any]] = None
|
||||
|
||||
|
||||
def _load_plugins() -> Union[None | List[Any]]:
|
||||
"""Lazy load plugins, exiting early if already loaded."""
|
||||
global _plugins
|
||||
|
||||
# Skip if we've already loaded plugins
|
||||
if _plugins is not None:
|
||||
return _plugins
|
||||
|
||||
# Load plugins
|
||||
_plugins = []
|
||||
for entry_point in entry_points(group="markitdown.plugin"):
|
||||
try:
|
||||
_plugins.append(entry_point.load())
|
||||
except Exception:
|
||||
tb = traceback.format_exc()
|
||||
warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}")
|
||||
|
||||
return _plugins
|
||||
|
||||
|
||||
class MarkItDown:
|
||||
|
|
@ -58,6 +83,8 @@ class MarkItDown:
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
load_plugins: bool = True,
|
||||
requests_session: Optional[requests.Session] = None,
|
||||
llm_client: Optional[Any] = None,
|
||||
llm_model: Optional[str] = None,
|
||||
|
|
@ -115,40 +142,38 @@ class MarkItDown:
|
|||
# Register converters for successful browsing operations
|
||||
# Later registrations are tried first / take higher priority than earlier registrations
|
||||
# To this end, the most specific converters should appear below the most generic converters
|
||||
self.register_page_converter(PlainTextConverter())
|
||||
self.register_page_converter(ZipConverter())
|
||||
self.register_page_converter(HtmlConverter())
|
||||
self.register_page_converter(RssConverter())
|
||||
self.register_page_converter(WikipediaConverter())
|
||||
self.register_page_converter(YouTubeConverter())
|
||||
self.register_page_converter(BingSerpConverter())
|
||||
self.register_page_converter(DocxConverter())
|
||||
self.register_page_converter(XlsxConverter())
|
||||
self.register_page_converter(XlsConverter())
|
||||
self.register_page_converter(PptxConverter())
|
||||
self.register_page_converter(WavConverter())
|
||||
self.register_page_converter(Mp3Converter())
|
||||
self.register_page_converter(ImageConverter())
|
||||
self.register_page_converter(IpynbConverter())
|
||||
self.register_page_converter(PdfConverter())
|
||||
self.register_page_converter(OutlookMsgConverter())
|
||||
self.register_converter(PlainTextConverter())
|
||||
self.register_converter(ZipConverter())
|
||||
self.register_converter(HtmlConverter())
|
||||
self.register_converter(RssConverter())
|
||||
self.register_converter(WikipediaConverter())
|
||||
self.register_converter(YouTubeConverter())
|
||||
self.register_converter(BingSerpConverter())
|
||||
self.register_converter(DocxConverter())
|
||||
self.register_converter(XlsxConverter())
|
||||
self.register_converter(XlsConverter())
|
||||
self.register_converter(PptxConverter())
|
||||
self.register_converter(WavConverter())
|
||||
self.register_converter(Mp3Converter())
|
||||
self.register_converter(ImageConverter())
|
||||
self.register_converter(IpynbConverter())
|
||||
self.register_converter(PdfConverter())
|
||||
self.register_converter(OutlookMsgConverter())
|
||||
|
||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||
if docintel_endpoint is not None:
|
||||
self.register_page_converter(
|
||||
self.register_converter(
|
||||
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
|
||||
)
|
||||
|
||||
# Load plugins
|
||||
for entry_point in entry_points(group="markitdown.plugin.converters"):
|
||||
try:
|
||||
plugin = entry_point.load()
|
||||
self.register_page_converter(plugin())
|
||||
# print(f"Loaded plugin {entry_point.value} as {entry_point.name}")
|
||||
|
||||
except ConverterPrerequisiteException as e:
|
||||
# print(f"Skipping plugin {entry_point.name} because of missing prerequisite: {e}")
|
||||
pass
|
||||
if load_plugins:
|
||||
for plugin in _load_plugins():
|
||||
try:
|
||||
plugin.register_converters(self)
|
||||
except Exception:
|
||||
tb = traceback.format_exc()
|
||||
warn(f"Plugin '{plugin}' failed to register converters:\n{tb}")
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||
|
|
@ -404,5 +429,13 @@ class MarkItDown:
|
|||
return []
|
||||
|
||||
def register_page_converter(self, converter: DocumentConverter) -> None:
    """DEPRECATED: Use register_converter instead.

    Emits a DeprecationWarning and then forwards ``converter`` to
    ``register_converter``.
    """
    warn(
        "register_page_converter is deprecated. Use register_converter instead.",
        DeprecationWarning,
        # Attribute the warning to the caller's line rather than to this wrapper.
        stacklevel=2,
    )
    self.register_converter(converter)
|
||||
|
||||
def register_converter(self, converter: DocumentConverter) -> None:
    """Add ``converter`` at the front of this instance's converter list."""
    # Index 0 places the newest registration ahead of all earlier ones.
    front = 0
    self._page_converters.insert(front, converter)
|
||||
|
|
|
|||
Loading…
Reference in a new issue