From 36a49806b5e847e58381ef4300f3225f47a64f34 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Wed, 5 Mar 2025 11:30:48 -0800 Subject: [PATCH] Updated sample plugin to new Converter interface. --- .../src/markitdown_sample_plugin/__about__.py | 2 +- .../src/markitdown_sample_plugin/_plugin.py | 59 +++++++++++++++---- .../tests/test_sample_plugin.py | 20 ++++--- 3 files changed, 61 insertions(+), 20 deletions(-) diff --git a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py index fa67ccb..a365900 100644 --- a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py +++ b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.0.1a2" +__version__ = "0.0.1a3" diff --git a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py index 98e660e..8071972 100644 --- a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py +++ b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py @@ -1,12 +1,26 @@ -from typing import Union +import locale +from typing import BinaryIO, Any from striprtf.striprtf import rtf_to_text -from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult +from markitdown import ( + MarkItDown, + DocumentConverter, + DocumentConverterResult, + StreamInfo, +) + __plugin_interface_version__ = ( 1 # The version of the plugin interface that this plugin uses ) +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/rtf", + "application/rtf", +] + +ACCEPTED_FILE_EXTENSIONS = [".rtf"] + def register_converters(markitdown: MarkItDown, **kwargs): """ @@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter): Converts an RTF file to in the simplest possible way. 
""" - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a RTF - extension = kwargs.get("file_extension", "") - if extension.lower() != ".rtf": - return None + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) - # Read the RTF file - with open(local_path, "r") as f: - rtf = f.read() + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Read the file stream into a str using the provided charset encoding, or using the system default + encoding = stream_info.charset or locale.getpreferredencoding() + stream_data = file_stream.read().decode(encoding) # Return the result return DocumentConverterResult( title=None, - text_content=rtf_to_text(rtf), + markdown=rtf_to_text(stream_data), ) diff --git a/packages/markitdown-sample-plugin/tests/test_sample_plugin.py b/packages/markitdown-sample-plugin/tests/test_sample_plugin.py index 49d54aa..6d0102d 100644 --- a/packages/markitdown-sample-plugin/tests/test_sample_plugin.py +++ b/packages/markitdown-sample-plugin/tests/test_sample_plugin.py @@ -2,7 +2,7 @@ import os import pytest -from markitdown import MarkItDown +from markitdown import MarkItDown, StreamInfo from markitdown_sample_plugin import RtfConverter TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") @@ -15,18 +15,22 @@ RTF_TEST_STRINGS = { def test_converter() -> None: """Tests the RTF converter dirctly."""
- converter = RtfConverter() - result = converter.convert( - os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf" - ) + with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream: + converter = RtfConverter() + result = converter.convert( + file_stream=file_stream, + stream_info=StreamInfo( + mimetype="text/rtf", extension=".rtf", filename="test.rtf" + ), + ) - for test_string in RTF_TEST_STRINGS: - assert test_string in result.text_content + for test_string in RTF_TEST_STRINGS: + assert test_string in result.text_content def test_markitdown() -> None: """Tests that MarkItDown correctly loads the plugin.""" - md = MarkItDown() + md = MarkItDown(enable_plugins=True) result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf")) for test_string in RTF_TEST_STRINGS: