Add support for Path objects in MarkItDown conversion methods

This commit is contained in:
SigireddyBalasai 2024-12-17 11:28:08 +00:00
parent ad5d4fb139
commit 0a7ef79733
2 changed files with 106 additions and 3 deletions

View file

@ -13,7 +13,8 @@ import sys
import tempfile import tempfile
import traceback import traceback
import zipfile import zipfile
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union, overload
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import catch_warnings from warnings import catch_warnings
@ -1042,7 +1043,7 @@ class MarkItDown:
self.register_page_converter(ZipConverter()) self.register_page_converter(ZipConverter())
def convert( def convert(
self, source: Union[str, requests.Response], **kwargs: Any self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
""" """
Args: Args:
@ -1063,10 +1064,14 @@ class MarkItDown:
# Request response # Request response
elif isinstance(source, requests.Response): elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs) return self.convert_response(source, **kwargs)
elif isinstance(source, Path):
return self.convert_local(source, **kwargs)
def convert_local( def convert_local(
self, path: str, **kwargs: Any self, path: Union[str, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
if isinstance(path, Path):
path = str(path)
# Prepare a list of extensions to try (in order of priority) # Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension") ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else [] extensions = [ext] if ext is not None else []

View file

@ -7,6 +7,7 @@ import pytest
import requests import requests
from markitdown import MarkItDown from markitdown import MarkItDown
from pathlib import Path
skip_remote = ( skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False True if os.environ.get("GITHUB_ACTIONS") else False
@ -229,8 +230,105 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content assert target in result.text_content
def test_markitdown_local_pathlib() -> None:
    """Check that ``MarkItDown.convert`` accepts ``pathlib.Path`` sources.

    Every fixture under ``TEST_FILES_DIR`` is handed to ``convert`` as a
    ``Path`` object, and the converted text (with backslashes removed) is
    compared against the module-level expected-string constants.
    """
    markitdown = MarkItDown()
    # Build the fixture directory once; each case only appends a file name.
    test_dir = Path(TEST_FILES_DIR)

    # Test XLSX processing
    result = markitdown.convert(test_dir / "test.xlsx")
    # Strip backslashes once per result instead of once per expected string.
    text_content = result.text_content.replace("\\", "")
    for test_string in XLSX_TEST_STRINGS:
        assert test_string in text_content

    # Test DOCX processing
    result = markitdown.convert(test_dir / "test.docx")
    text_content = result.text_content.replace("\\", "")
    for test_string in DOCX_TEST_STRINGS:
        assert test_string in text_content

    # Test DOCX processing, with comments
    result = markitdown.convert(
        test_dir / "test_with_comment.docx",
        style_map="comment-reference => ",
    )
    text_content = result.text_content.replace("\\", "")
    for test_string in DOCX_COMMENT_TEST_STRINGS:
        assert test_string in text_content

    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    result = markitdown_with_style_map.convert(
        test_dir / "test_with_comment.docx"
    )
    text_content = result.text_content.replace("\\", "")
    for test_string in DOCX_COMMENT_TEST_STRINGS:
        assert test_string in text_content

    # Test PPTX processing
    result = markitdown.convert(test_dir / "test.pptx")
    text_content = result.text_content.replace("\\", "")
    for test_string in PPTX_TEST_STRINGS:
        assert test_string in text_content

    # Test HTML processing
    result = markitdown.convert(test_dir / "test_blog.html", url=BLOG_TEST_URL)
    text_content = result.text_content.replace("\\", "")
    for test_string in BLOG_TEST_STRINGS:
        assert test_string in text_content

    # Test ZIP file processing
    result = markitdown.convert(test_dir / "test_files.zip")
    text_content = result.text_content.replace("\\", "")
    for test_string in DOCX_TEST_STRINGS:
        assert test_string in text_content

    # Test Wikipedia processing
    result = markitdown.convert(
        test_dir / "test_wikipedia.html", url=WIKIPEDIA_TEST_URL
    )
    text_content = result.text_content.replace("\\", "")
    for test_string in WIKIPEDIA_TEST_EXCLUDES:
        assert test_string not in text_content
    for test_string in WIKIPEDIA_TEST_STRINGS:
        assert test_string in text_content

    # Test Bing processing
    result = markitdown.convert(test_dir / "test_serp.html", url=SERP_TEST_URL)
    text_content = result.text_content.replace("\\", "")
    for test_string in SERP_TEST_EXCLUDES:
        assert test_string not in text_content
    for test_string in SERP_TEST_STRINGS:
        assert test_string in text_content

    # Test non-UTF-8 encoding
    result = markitdown.convert(test_dir / "test_mskanji.csv")
    text_content = result.text_content.replace("\\", "")
    for test_string in CSV_CP932_TEST_STRINGS:
        assert test_string in text_content
@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool_pathlib() -> None:
    """Convert the JPG fixture via a ``Path`` source and check its metadata.

    For every key in ``JPG_TEST_EXIFTOOL`` the line ``"<key>: <value>"`` must
    appear in the converted text.
    """
    converter = MarkItDown()
    jpg_path = Path(TEST_FILES_DIR) / "test.jpg"

    # Test JPG metadata processing
    result = converter.convert(jpg_path)
    for field_name in JPG_TEST_EXIFTOOL:
        expected_value = JPG_TEST_EXIFTOOL[field_name]
        expected_line = f"{field_name}: {expected_value}"
        assert expected_line in result.text_content
if __name__ == "__main__": if __name__ == "__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
test_markitdown_remote() test_markitdown_remote()
test_markitdown_local() test_markitdown_local()
test_markitdown_exiftool() test_markitdown_exiftool()
test_markitdown_local_pathlib()
test_markitdown_exiftool_pathlib()