Add support for Path objects in MarkItDown conversion methods
This commit is contained in:
parent
ad5d4fb139
commit
0a7ef79733
2 changed files with 106 additions and 3 deletions
|
|
@@ -13,7 +13,8 @@ import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import traceback
|
import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union, overload
|
||||||
|
from pathlib import Path
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
from warnings import catch_warnings
|
from warnings import catch_warnings
|
||||||
|
|
||||||
|
|
@@ -1042,7 +1043,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(ZipConverter())
|
self.register_page_converter(ZipConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response], **kwargs: Any
|
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
|
|
@@ -1063,10 +1064,14 @@ class MarkItDown:
|
||||||
# Request response
|
# Request response
|
||||||
elif isinstance(source, requests.Response):
|
elif isinstance(source, requests.Response):
|
||||||
return self.convert_response(source, **kwargs)
|
return self.convert_response(source, **kwargs)
|
||||||
|
elif isinstance(source, Path):
|
||||||
|
return self.convert_local(source, **kwargs)
|
||||||
|
|
||||||
def convert_local(
|
def convert_local(
|
||||||
self, path: str, **kwargs: Any
|
self, path: Union[str, Path], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
|
if isinstance(path, Path):
|
||||||
|
path = str(path)
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
ext = kwargs.get("file_extension")
|
ext = kwargs.get("file_extension")
|
||||||
extensions = [ext] if ext is not None else []
|
extensions = [ext] if ext is not None else []
|
||||||
|
|
|
||||||
|
|
@@ -7,6 +7,7 @@ import pytest
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from markitdown import MarkItDown
|
from markitdown import MarkItDown
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
skip_remote = (
|
skip_remote = (
|
||||||
True if os.environ.get("GITHUB_ACTIONS") else False
|
True if os.environ.get("GITHUB_ACTIONS") else False
|
||||||
|
|
@@ -229,8 +230,105 @@ def test_markitdown_exiftool() -> None:
|
||||||
assert target in result.text_content
|
assert target in result.text_content
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_markitdown_local_pathlib() -> None:
    """Check that ``MarkItDown.convert`` accepts ``pathlib.Path`` sources.

    Each local fixture under ``TEST_FILES_DIR`` is passed to ``convert`` as a
    ``Path`` (not a ``str``). The converted text must contain every expected
    string for that fixture and none of its excluded strings.
    """
    markitdown = MarkItDown()
    test_dir = Path(TEST_FILES_DIR)

    def check(converter, filename, expected, excluded=(), **kwargs):
        """Convert ``test_dir / filename`` and verify the resulting text.

        ``kwargs`` are forwarded unchanged to ``converter.convert``.
        """
        result = converter.convert(test_dir / filename, **kwargs)
        # Backslashes are stripped once per fixture before comparing
        # (presumably to ignore Markdown escaping -- confirm against the
        # converters' output).
        text_content = result.text_content.replace("\\", "")
        for test_string in excluded:
            assert (
                test_string not in text_content
            ), f"{filename}: unexpected {test_string!r}"
        for test_string in expected:
            assert (
                test_string in text_content
            ), f"{filename}: missing {test_string!r}"

    # Test XLSX processing
    check(markitdown, "test.xlsx", XLSX_TEST_STRINGS)

    # Test DOCX processing
    check(markitdown, "test.docx", DOCX_TEST_STRINGS)

    # Test DOCX processing, with comments
    check(
        markitdown,
        "test_with_comment.docx",
        DOCX_COMMENT_TEST_STRINGS,
        style_map="comment-reference => ",
    )

    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    check(
        markitdown_with_style_map,
        "test_with_comment.docx",
        DOCX_COMMENT_TEST_STRINGS,
    )

    # Test PPTX processing
    check(markitdown, "test.pptx", PPTX_TEST_STRINGS)

    # Test HTML processing
    check(markitdown, "test_blog.html", BLOG_TEST_STRINGS, url=BLOG_TEST_URL)

    # Test ZIP file processing
    check(markitdown, "test_files.zip", DOCX_TEST_STRINGS)

    # Test Wikipedia processing
    check(
        markitdown,
        "test_wikipedia.html",
        WIKIPEDIA_TEST_STRINGS,
        excluded=WIKIPEDIA_TEST_EXCLUDES,
        url=WIKIPEDIA_TEST_URL,
    )

    # Test Bing processing
    check(
        markitdown,
        "test_serp.html",
        SERP_TEST_STRINGS,
        excluded=SERP_TEST_EXCLUDES,
        url=SERP_TEST_URL,
    )

    # Test non-UTF-8 encoding
    check(markitdown, "test_mskanji.csv", CSV_CP932_TEST_STRINGS)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool_pathlib() -> None:
    """Check JPG metadata conversion when the source is a ``pathlib.Path``.

    Every ``"<key>: <value>"`` pair built from ``JPG_TEST_EXIFTOOL`` must
    appear in the converted text of ``test.jpg``.
    """
    jpg_path = Path(TEST_FILES_DIR) / "test.jpg"
    converter = MarkItDown()

    # Test JPG metadata processing
    conversion = converter.convert(jpg_path)
    for tag in JPG_TEST_EXIFTOOL:
        expected_line = f"{tag}: {JPG_TEST_EXIFTOOL[tag]}"
        assert expected_line in conversion.text_content
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
test_markitdown_local()
|
test_markitdown_local()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
|
test_markitdown_local_pathlib()
|
||||||
|
test_markitdown_exiftool_pathlib()
|
||||||
Loading…
Reference in a new issue