Add support for Path objects in MarkItDown conversion methods
This commit is contained in:
parent
ad5d4fb139
commit
0a7ef79733
2 changed files with 106 additions and 3 deletions
|
|
@ -13,7 +13,8 @@ import sys
|
|||
import tempfile
|
||||
import traceback
|
||||
import zipfile
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, Dict, List, Optional, Union, overload
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from warnings import catch_warnings
|
||||
|
||||
|
|
@ -1042,7 +1043,7 @@ class MarkItDown:
|
|||
self.register_page_converter(ZipConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response], **kwargs: Any
|
||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
"""
|
||||
Args:
|
||||
|
|
@ -1063,10 +1064,14 @@ class MarkItDown:
|
|||
# Request response
|
||||
elif isinstance(source, requests.Response):
|
||||
return self.convert_response(source, **kwargs)
|
||||
elif isinstance(source, Path):
|
||||
return self.convert_local(source, **kwargs)
|
||||
|
||||
def convert_local(
|
||||
self, path: str, **kwargs: Any
|
||||
self, path: Union[str, Path], **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
if isinstance(path, Path):
|
||||
path = str(path)
|
||||
# Prepare a list of extensions to try (in order of priority)
|
||||
ext = kwargs.get("file_extension")
|
||||
extensions = [ext] if ext is not None else []
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import pytest
|
|||
import requests
|
||||
|
||||
from markitdown import MarkItDown
|
||||
from pathlib import Path
|
||||
|
||||
skip_remote = (
|
||||
True if os.environ.get("GITHUB_ACTIONS") else False
|
||||
|
|
@ -229,8 +230,105 @@ def test_markitdown_exiftool() -> None:
|
|||
assert target in result.text_content
|
||||
|
||||
|
||||
|
||||
def test_markitdown_local_pathlib() -> None:
    """Check that ``MarkItDown.convert`` accepts ``pathlib.Path`` sources.

    Each fixture under ``TEST_FILES_DIR`` is passed to ``convert`` as a
    ``Path`` object. Backslashes are stripped from the resulting text
    (presumably to ignore Markdown escaping -- confirm against the
    converters) before checking for the expected / excluded strings.
    """
    markitdown = MarkItDown()
    test_dir = Path(TEST_FILES_DIR)

    # Test XLSX processing
    result = markitdown.convert(test_dir / "test.xlsx")
    # Strip backslashes once per result instead of once per expected string.
    text_content = result.text_content.replace("\\", "")
    for test_string in XLSX_TEST_STRINGS:
        assert test_string in text_content

    # Test DOCX processing
    result = markitdown.convert(test_dir / "test.docx")
    text_content = result.text_content.replace("\\", "")
    for test_string in DOCX_TEST_STRINGS:
        assert test_string in text_content

    # Test DOCX processing, with comments
    result = markitdown.convert(
        test_dir / "test_with_comment.docx",
        style_map="comment-reference => ",
    )
    text_content = result.text_content.replace("\\", "")
    for test_string in DOCX_COMMENT_TEST_STRINGS:
        assert test_string in text_content

    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    result = markitdown_with_style_map.convert(
        test_dir / "test_with_comment.docx"
    )
    text_content = result.text_content.replace("\\", "")
    for test_string in DOCX_COMMENT_TEST_STRINGS:
        assert test_string in text_content

    # Test PPTX processing
    result = markitdown.convert(test_dir / "test.pptx")
    text_content = result.text_content.replace("\\", "")
    for test_string in PPTX_TEST_STRINGS:
        assert test_string in text_content

    # Test HTML processing
    result = markitdown.convert(test_dir / "test_blog.html", url=BLOG_TEST_URL)
    text_content = result.text_content.replace("\\", "")
    for test_string in BLOG_TEST_STRINGS:
        assert test_string in text_content

    # Test ZIP file processing
    result = markitdown.convert(test_dir / "test_files.zip")
    text_content = result.text_content.replace("\\", "")
    for test_string in DOCX_TEST_STRINGS:
        assert test_string in text_content

    # Test Wikipedia processing
    result = markitdown.convert(
        test_dir / "test_wikipedia.html", url=WIKIPEDIA_TEST_URL
    )
    text_content = result.text_content.replace("\\", "")
    for test_string in WIKIPEDIA_TEST_EXCLUDES:
        assert test_string not in text_content
    for test_string in WIKIPEDIA_TEST_STRINGS:
        assert test_string in text_content

    # Test Bing processing
    result = markitdown.convert(test_dir / "test_serp.html", url=SERP_TEST_URL)
    text_content = result.text_content.replace("\\", "")
    for test_string in SERP_TEST_EXCLUDES:
        assert test_string not in text_content
    for test_string in SERP_TEST_STRINGS:
        assert test_string in text_content

    # Test non-UTF-8 encoding
    result = markitdown.convert(test_dir / "test_mskanji.csv")
    text_content = result.text_content.replace("\\", "")
    for test_string in CSV_CP932_TEST_STRINGS:
        assert test_string in text_content
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool_pathlib() -> None:
    """Convert the JPG fixture from a ``Path`` and check its metadata lines.

    Every key in ``JPG_TEST_EXIFTOOL`` must appear in the converted text as
    a ``"<key>: <value>"`` line fragment.
    """
    converter = MarkItDown()

    # Test JPG metadata processing
    jpg_path = Path(TEST_FILES_DIR) / "test.jpg"
    converted = converter.convert(jpg_path)

    expected_fragments = [
        f"{key}: {JPG_TEST_EXIFTOOL[key]}" for key in JPG_TEST_EXIFTOOL
    ]
    for target in expected_fragments:
        assert target in converted.text_content
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Runs this file's tests from the command line, in declaration order.
    for run_test in (
        test_markitdown_remote,
        test_markitdown_local,
        test_markitdown_exiftool,
        test_markitdown_local_pathlib,
        test_markitdown_exiftool_pathlib,
    ):
        run_test()
|
||||
Loading…
Reference in a new issue