Fixed formatting, and adjusted tests.
This commit is contained in:
parent
887dbbcf5c
commit
959d43c637
8 changed files with 113 additions and 77 deletions
|
|
@ -187,9 +187,15 @@ def main():
|
|||
markitdown = MarkItDown(enable_plugins=args.use_plugins)
|
||||
|
||||
if args.filename is None:
|
||||
result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris)
|
||||
result = markitdown.convert_stream(
|
||||
sys.stdin.buffer,
|
||||
stream_info=stream_info,
|
||||
keep_data_uris=args.keep_data_uris,
|
||||
)
|
||||
else:
|
||||
result = markitdown.convert(args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris)
|
||||
result = markitdown.convert(
|
||||
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
|
||||
)
|
||||
|
||||
_handle_output(args, result)
|
||||
|
||||
|
|
|
|||
|
|
@ -73,6 +73,5 @@ class DocxConverter(HtmlConverter):
|
|||
|
||||
style_map = kwargs.get("style_map", None)
|
||||
return self._html_converter.convert_string(
|
||||
mammoth.convert_to_html(file_stream, style_map=style_map).value,
|
||||
**kwargs
|
||||
mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
|
||||
)
|
||||
|
|
|
|||
|
|
@ -215,7 +215,10 @@ class PptxConverter(DocumentConverter):
|
|||
first_row = False
|
||||
html_table += "</table></body></html>"
|
||||
|
||||
return self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + "\n"
|
||||
return (
|
||||
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
def _convert_chart_to_markdown(self, chart):
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -76,7 +76,9 @@ class WikipediaConverter(DocumentConverter):
|
|||
main_title = title_elm.string
|
||||
|
||||
# Convert the page
|
||||
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(**kwargs).convert_soup(body_elm)
|
||||
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
|
||||
**kwargs
|
||||
).convert_soup(body_elm)
|
||||
else:
|
||||
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
|
||||
|
||||
|
|
|
|||
|
|
@ -86,7 +86,9 @@ class XlsxConverter(DocumentConverter):
|
|||
md_content += f"## {s}\n"
|
||||
html_content = sheets[s].to_html(index=False)
|
||||
md_content += (
|
||||
self._html_converter.convert_string(html_content, **kwargs).markdown.strip()
|
||||
self._html_converter.convert_string(
|
||||
html_content, **kwargs
|
||||
).markdown.strip()
|
||||
+ "\n\n"
|
||||
)
|
||||
|
||||
|
|
@ -146,7 +148,9 @@ class XlsConverter(DocumentConverter):
|
|||
md_content += f"## {s}\n"
|
||||
html_content = sheets[s].to_html(index=False)
|
||||
md_content += (
|
||||
self._html_converter.convert_string(html_content, **kwargs).markdown.strip()
|
||||
self._html_converter.convert_string(
|
||||
html_content, **kwargs
|
||||
).markdown.strip()
|
||||
+ "\n\n"
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -10,9 +10,6 @@ class FileTestVector(object):
|
|||
url: str | None
|
||||
must_include: List[str]
|
||||
must_not_include: List[str]
|
||||
# in test keep_data_uris cases, we want to ensure that the data URIs are kept
|
||||
must_include_with_data_uris: List[str] = dataclasses.field(default_factory=list)
|
||||
must_not_include_with_data_uris: List[str] = dataclasses.field(default_factory=list)
|
||||
|
||||
|
||||
GENERAL_TEST_VECTORS = [
|
||||
|
|
@ -33,12 +30,6 @@ GENERAL_TEST_VECTORS = [
|
|||
must_not_include=[
|
||||
"data:image/png;base64,iVBORw0KGgoAAAANSU",
|
||||
],
|
||||
must_include_with_data_uris=[
|
||||
"data:image/png;base64,iVBORw0KGgoAAAANSU",
|
||||
],
|
||||
must_not_include_with_data_uris=[
|
||||
"data:image/png;base64...",
|
||||
],
|
||||
),
|
||||
FileTestVector(
|
||||
filename="test.xlsx",
|
||||
|
|
@ -77,17 +68,9 @@ GENERAL_TEST_VECTORS = [
|
|||
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
|
||||
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
|
||||
"2003", # chart value
|
||||
"![This phrase of the caption is Human-written.]", # image caption
|
||||
],
|
||||
must_not_include=[
|
||||
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"
|
||||
],
|
||||
must_include_with_data_uris=[
|
||||
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
|
||||
],
|
||||
must_not_include_with_data_uris=[
|
||||
"",
|
||||
],
|
||||
must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
|
||||
),
|
||||
FileTestVector(
|
||||
filename="test_outlook_msg.msg",
|
||||
|
|
@ -251,3 +234,45 @@ GENERAL_TEST_VECTORS = [
|
|||
must_not_include=[],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
DATA_URI_TEST_VECTORS = [
|
||||
FileTestVector(
|
||||
filename="test.docx",
|
||||
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
charset=None,
|
||||
url=None,
|
||||
must_include=[
|
||||
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
|
||||
"49e168b7-d2ae-407f-a055-2167576f39a1",
|
||||
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
|
||||
"# Abstract",
|
||||
"# Introduction",
|
||||
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
|
||||
"data:image/png;base64,iVBORw0KGgoAAAANSU",
|
||||
],
|
||||
must_not_include=[
|
||||
"data:image/png;base64...",
|
||||
],
|
||||
),
|
||||
FileTestVector(
|
||||
filename="test.pptx",
|
||||
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
charset=None,
|
||||
url=None,
|
||||
must_include=[
|
||||
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
|
||||
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
|
||||
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
|
||||
"1b92870d-e3b5-4e65-8153-919f4ff45592",
|
||||
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
|
||||
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
|
||||
"2003", # chart value
|
||||
"![This phrase of the caption is Human-written.]", # image caption
|
||||
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
|
||||
],
|
||||
must_not_include=[
|
||||
"",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
|
|
|||
|
|
@ -7,9 +7,17 @@ import locale
|
|||
from typing import List
|
||||
|
||||
if __name__ == "__main__":
|
||||
from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
|
||||
from _test_vectors import (
|
||||
GENERAL_TEST_VECTORS,
|
||||
DATA_URI_TEST_VECTORS,
|
||||
FileTestVector,
|
||||
)
|
||||
else:
|
||||
from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
|
||||
from ._test_vectors import (
|
||||
GENERAL_TEST_VECTORS,
|
||||
DATA_URI_TEST_VECTORS,
|
||||
FileTestVector,
|
||||
)
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
|
|
@ -149,7 +157,7 @@ def test_convert_url(shared_tmp_dir, test_vector):
|
|||
assert test_string not in stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
|
||||
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
|
||||
def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
|
||||
"""Test CLI functionality when keep_data_uris is enabled"""
|
||||
|
||||
|
|
@ -173,21 +181,10 @@ def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
|
|||
|
||||
with open(output_file, "r") as f:
|
||||
output_data = f.read()
|
||||
for test_string in test_vector.must_include_with_data_uris:
|
||||
for test_string in test_vector.must_include:
|
||||
assert test_string in output_data
|
||||
for test_string in test_vector.must_not_include_with_data_uris:
|
||||
for test_string in test_vector.must_not_include:
|
||||
assert test_string not in output_data
|
||||
# Verify that basic test conditions are still met
|
||||
for string in test_vector.must_include:
|
||||
if "data:image" in string:
|
||||
# Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
|
||||
continue
|
||||
assert string in output_data
|
||||
for string in test_vector.must_not_include:
|
||||
if "data:image" in string:
|
||||
# Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
|
||||
continue
|
||||
assert string not in output_data
|
||||
|
||||
os.remove(output_file)
|
||||
assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
|
||||
|
|
@ -200,12 +197,12 @@ if __name__ == "__main__":
|
|||
"""Runs this file's tests from the command line."""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
# General tests
|
||||
for test_function in [
|
||||
test_output_to_stdout,
|
||||
test_output_to_file,
|
||||
test_input_from_stdin_without_hints,
|
||||
test_convert_url,
|
||||
test_output_to_file_with_data_uris,
|
||||
]:
|
||||
for test_vector in CLI_TEST_VECTORS:
|
||||
print(
|
||||
|
|
@ -214,4 +211,17 @@ if __name__ == "__main__":
|
|||
)
|
||||
test_function(tmp_dir, test_vector)
|
||||
print("OK")
|
||||
|
||||
# Data URI tests
|
||||
for test_function in [
|
||||
test_output_to_file_with_data_uris,
|
||||
]:
|
||||
for test_vector in DATA_URI_TEST_VECTORS:
|
||||
print(
|
||||
f"Running {test_function.__name__} on {test_vector.filename}...",
|
||||
end="",
|
||||
)
|
||||
test_function(tmp_dir, test_vector)
|
||||
print("OK")
|
||||
|
||||
print("All tests passed!")
|
||||
|
|
|
|||
|
|
@ -6,9 +6,9 @@ import codecs
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from _test_vectors import GENERAL_TEST_VECTORS
|
||||
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
|
||||
else:
|
||||
from ._test_vectors import GENERAL_TEST_VECTORS
|
||||
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
|
|
@ -124,7 +124,7 @@ def test_convert_url(test_vector):
|
|||
assert string not in result.markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
||||
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
|
||||
def test_convert_with_data_uris(test_vector):
|
||||
"""Test API functionality when keep_data_uris is enabled"""
|
||||
markitdown = MarkItDown()
|
||||
|
|
@ -133,29 +133,16 @@ def test_convert_with_data_uris(test_vector):
|
|||
result = markitdown.convert(
|
||||
os.path.join(TEST_FILES_DIR, test_vector.filename),
|
||||
keep_data_uris=True,
|
||||
url=test_vector.url
|
||||
url=test_vector.url,
|
||||
)
|
||||
|
||||
# Verify keep_data_uris related test conditions
|
||||
for string in test_vector.must_include_with_data_uris:
|
||||
assert string in result.markdown
|
||||
for string in test_vector.must_not_include_with_data_uris:
|
||||
assert string not in result.markdown
|
||||
|
||||
# Verify that basic test conditions are still met
|
||||
for string in test_vector.must_include:
|
||||
if "data:image" in string:
|
||||
# Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
|
||||
continue
|
||||
assert string in result.markdown
|
||||
for string in test_vector.must_not_include:
|
||||
if "data:image" in string:
|
||||
# Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
|
||||
continue
|
||||
assert string not in result.markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
||||
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
|
||||
def test_convert_stream_with_data_uris(test_vector):
|
||||
"""Test the conversion of a stream when keep_data_uris is enabled."""
|
||||
markitdown = MarkItDown()
|
||||
|
|
@ -168,25 +155,12 @@ def test_convert_stream_with_data_uris(test_vector):
|
|||
|
||||
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
||||
result = markitdown.convert(
|
||||
stream,
|
||||
stream_info=stream_info,
|
||||
keep_data_uris=True,
|
||||
url=test_vector.url
|
||||
stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
|
||||
)
|
||||
|
||||
# Verify keep_data_uris related test conditions
|
||||
for string in test_vector.must_include_with_data_uris:
|
||||
assert string in result.markdown
|
||||
for string in test_vector.must_not_include_with_data_uris:
|
||||
assert string not in result.markdown
|
||||
|
||||
# Verify that basic test conditions are still met
|
||||
for string in test_vector.must_include:
|
||||
assert string in result.markdown
|
||||
for string in test_vector.must_not_include:
|
||||
# Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
|
||||
if "data:image" in string:
|
||||
continue
|
||||
assert string not in result.markdown
|
||||
|
||||
|
||||
|
|
@ -194,14 +168,14 @@ if __name__ == "__main__":
|
|||
import sys
|
||||
|
||||
"""Runs this file's tests from the command line."""
|
||||
|
||||
# General tests
|
||||
for test_function in [
|
||||
test_guess_stream_info,
|
||||
test_convert_local,
|
||||
test_convert_stream_with_hints,
|
||||
test_convert_stream_without_hints,
|
||||
test_convert_url,
|
||||
test_convert_with_data_uris,
|
||||
test_convert_stream_with_data_uris,
|
||||
]:
|
||||
for test_vector in GENERAL_TEST_VECTORS:
|
||||
print(
|
||||
|
|
@ -209,4 +183,17 @@ if __name__ == "__main__":
|
|||
)
|
||||
test_function(test_vector)
|
||||
print("OK")
|
||||
|
||||
# Data URI tests
|
||||
for test_function in [
|
||||
test_convert_with_data_uris,
|
||||
test_convert_stream_with_data_uris,
|
||||
]:
|
||||
for test_vector in DATA_URI_TEST_VECTORS:
|
||||
print(
|
||||
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
|
||||
)
|
||||
test_function(test_vector)
|
||||
print("OK")
|
||||
|
||||
print("All tests passed!")
|
||||
|
|
|
|||
Loading…
Reference in a new issue