Updated sample plugin to new Converter interface.

This commit is contained in:
Adam Fourney 2025-03-05 11:30:48 -08:00
parent b3d6009eb8
commit 36a49806b5
3 changed files with 61 additions and 20 deletions

View file

@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.0.1a2" __version__ = "0.0.1a3"

View file

@ -1,12 +1,26 @@
from typing import Union import locale
from typing import BinaryIO, Any
from striprtf.striprtf import rtf_to_text from striprtf.striprtf import rtf_to_text
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult from markitdown import (
MarkItDown,
DocumentConverter,
DocumentConverterResult,
StreamInfo,
)
__plugin_interface_version__ = ( __plugin_interface_version__ = (
1 # The version of the plugin interface that this plugin uses 1 # The version of the plugin interface that this plugin uses
) )
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/rtf",
"application/rtf",
]
ACCEPTED_FILE_EXTENSIONS = [".rtf"]
def register_converters(markitdown: MarkItDown, **kwargs): def register_converters(markitdown: MarkItDown, **kwargs):
""" """
@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter):
Converts an RTF file to text in the simplest possible way. Converts an RTF file to text in the simplest possible way.
""" """
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def __init__(
# Bail if not a RTF self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
extension = kwargs.get("file_extension", "") ):
if extension.lower() != ".rtf": super().__init__(priority=priority)
return None
# Read the RTF file def accepts(
with open(local_path, "r") as f: self,
rtf = f.read() file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Read the file stream into a str using the provided charset encoding, or using the system default
encoding = stream_info.charset or locale.getpreferredencoding()
stream_data = file_stream.read().decode(encoding)
# Return the result # Return the result
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=rtf_to_text(rtf), markdown=rtf_to_text(stream_data),
) )

View file

@ -2,7 +2,7 @@
import os import os
import pytest import pytest
from markitdown import MarkItDown from markitdown import MarkItDown, StreamInfo
from markitdown_sample_plugin import RtfConverter from markitdown_sample_plugin import RtfConverter
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@ -15,9 +15,13 @@ RTF_TEST_STRINGS = {
def test_converter() -> None: def test_converter() -> None:
"""Tests the RTF converter directly.""" """Tests the RTF converter directly."""
with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
converter = RtfConverter() converter = RtfConverter()
result = converter.convert( result = converter.convert(
os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf" file_stream=file_stream,
stream_info=StreamInfo(
mimetype="text/rtf", extension=".rtf", filename="test.rtf"
),
) )
for test_string in RTF_TEST_STRINGS: for test_string in RTF_TEST_STRINGS:
@ -26,7 +30,7 @@ def test_converter() -> None:
def test_markitdown() -> None: def test_markitdown() -> None:
"""Tests that MarkItDown correctly loads the plugin.""" """Tests that MarkItDown correctly loads the plugin."""
md = MarkItDown() md = MarkItDown(enable_plugins=True)
result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf")) result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
for test_string in RTF_TEST_STRINGS: for test_string in RTF_TEST_STRINGS: