Updated sample plugin to new Converter interface.
This commit is contained in:
parent
b3d6009eb8
commit
36a49806b5
3 changed files with 61 additions and 20 deletions
|
|
@ -1,4 +1,4 @@
|
|||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.0.1a2"
|
||||
__version__ = "0.0.1a3"
|
||||
|
|
|
|||
|
|
@ -1,12 +1,26 @@
|
|||
from typing import Union
|
||||
import locale
|
||||
from typing import BinaryIO, Any
|
||||
from striprtf.striprtf import rtf_to_text
|
||||
|
||||
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
DocumentConverter,
|
||||
DocumentConverterResult,
|
||||
StreamInfo,
|
||||
)
|
||||
|
||||
|
||||
__plugin_interface_version__ = (
|
||||
1 # The version of the plugin interface that this plugin uses
|
||||
)
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"text/rtf",
|
||||
"application/rtf",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [".rtf"]
|
||||
|
||||
|
||||
def register_converters(markitdown: MarkItDown, **kwargs):
|
||||
"""
|
||||
|
|
@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter):
|
|||
Converts an RTF file to Markdown in the simplest possible way.
|
||||
"""
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a RTF
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".rtf":
|
||||
return None
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
# Read the RTF file
|
||||
with open(local_path, "r") as f:
|
||||
rtf = f.read()
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Read the file stream into a str using the provided charset encoding, or the system default
|
||||
encoding = stream_info.charset or locale.getpreferredencoding()
|
||||
stream_data = file_stream.read().decode(encoding)
|
||||
|
||||
# Return the result
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
text_content=rtf_to_text(rtf),
|
||||
markdown=rtf_to_text(stream_data),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
import os
|
||||
import pytest
|
||||
|
||||
from markitdown import MarkItDown
|
||||
from markitdown import MarkItDown, StreamInfo
|
||||
from markitdown_sample_plugin import RtfConverter
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
|
@ -15,18 +15,22 @@ RTF_TEST_STRINGS = {
|
|||
|
||||
def test_converter() -> None:
|
||||
"""Tests the RTF converter dirctly."""
|
||||
converter = RtfConverter()
|
||||
result = converter.convert(
|
||||
os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf"
|
||||
)
|
||||
with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
|
||||
converter = RtfConverter()
|
||||
result = converter.convert(
|
||||
file_stream=file_stream,
|
||||
stream_info=StreamInfo(
|
||||
mimetype="text/rtf", extension=".rtf", filename="test.rtf"
|
||||
),
|
||||
)
|
||||
|
||||
for test_string in RTF_TEST_STRINGS:
|
||||
assert test_string in result.text_content
|
||||
for test_string in RTF_TEST_STRINGS:
|
||||
assert test_string in result.text_content
|
||||
|
||||
|
||||
def test_markitdown() -> None:
|
||||
"""Tests that MarkItDown correctly loads the plugin."""
|
||||
md = MarkItDown()
|
||||
md = MarkItDown(enable_plugins=True)
|
||||
result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
|
||||
|
||||
for test_string in RTF_TEST_STRINGS:
|
||||
|
|
|
|||
Loading…
Reference in a new issue