Updated sample plugin to new Converter interface.
This commit is contained in:
parent
b3d6009eb8
commit
36a49806b5
3 changed files with 61 additions and 20 deletions
|
|
@ -1,4 +1,4 @@
|
||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
__version__ = "0.0.1a2"
|
__version__ = "0.0.1a3"
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,26 @@
|
||||||
from typing import Union
|
import locale
|
||||||
|
from typing import BinaryIO, Any
|
||||||
from striprtf.striprtf import rtf_to_text
|
from striprtf.striprtf import rtf_to_text
|
||||||
|
|
||||||
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
|
from markitdown import (
|
||||||
|
MarkItDown,
|
||||||
|
DocumentConverter,
|
||||||
|
DocumentConverterResult,
|
||||||
|
StreamInfo,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__plugin_interface_version__ = (
|
__plugin_interface_version__ = (
|
||||||
1 # The version of the plugin interface that this plugin uses
|
1 # The version of the plugin interface that this plugin uses
|
||||||
)
|
)
|
||||||
|
|
||||||
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
"text/rtf",
|
||||||
|
"application/rtf",
|
||||||
|
]
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [".rtf"]
|
||||||
|
|
||||||
|
|
||||||
def register_converters(markitdown: MarkItDown, **kwargs):
|
def register_converters(markitdown: MarkItDown, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|
@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter):
|
||||||
Converts an RTF file to text in the simplest possible way.
|
Converts an RTF file to text in the simplest possible way.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def __init__(
|
||||||
# Bail if not a RTF
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
extension = kwargs.get("file_extension", "")
|
):
|
||||||
if extension.lower() != ".rtf":
|
super().__init__(priority=priority)
|
||||||
return None
|
|
||||||
|
|
||||||
# Read the RTF file
|
def accepts(
|
||||||
with open(local_path, "r") as f:
|
self,
|
||||||
rtf = f.read()
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> bool:
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
# Read the file stream into a str using the provided charset encoding, or the system default
|
||||||
|
encoding = stream_info.charset or locale.getpreferredencoding()
|
||||||
|
stream_data = file_stream.read().decode(encoding)
|
||||||
|
|
||||||
# Return the result
|
# Return the result
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=rtf_to_text(rtf),
|
markdown=rtf_to_text(stream_data),
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
import os
|
import os
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from markitdown import MarkItDown
|
from markitdown import MarkItDown, StreamInfo
|
||||||
from markitdown_sample_plugin import RtfConverter
|
from markitdown_sample_plugin import RtfConverter
|
||||||
|
|
||||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||||
|
|
@ -15,18 +15,22 @@ RTF_TEST_STRINGS = {
|
||||||
|
|
||||||
def test_converter() -> None:
|
def test_converter() -> None:
|
||||||
"""Tests the RTF converter directly."""
|
"""Tests the RTF converter directly."""
|
||||||
converter = RtfConverter()
|
with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
|
||||||
result = converter.convert(
|
converter = RtfConverter()
|
||||||
os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf"
|
result = converter.convert(
|
||||||
)
|
file_stream=file_stream,
|
||||||
|
stream_info=StreamInfo(
|
||||||
|
mimetype="text/rtf", extension=".rtf", filename="test.rtf"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
for test_string in RTF_TEST_STRINGS:
|
for test_string in RTF_TEST_STRINGS:
|
||||||
assert test_string in result.text_content
|
assert test_string in result.text_content
|
||||||
|
|
||||||
|
|
||||||
def test_markitdown() -> None:
|
def test_markitdown() -> None:
|
||||||
"""Tests that MarkItDown correctly loads the plugin."""
|
"""Tests that MarkItDown correctly loads the plugin."""
|
||||||
md = MarkItDown()
|
md = MarkItDown(enable_plugins=True)
|
||||||
result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
|
result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
|
||||||
|
|
||||||
for test_string in RTF_TEST_STRINGS:
|
for test_string in RTF_TEST_STRINGS:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue