diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6d3d3c7..3d8e396 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -187,9 +187,15 @@ def main(): markitdown = MarkItDown(enable_plugins=args.use_plugins) if args.filename is None: - result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris) + result = markitdown.convert_stream( + sys.stdin.buffer, + stream_info=stream_info, + keep_data_uris=args.keep_data_uris, + ) else: - result = markitdown.convert(args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris) + result = markitdown.convert( + args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + ) _handle_output(args, result) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 4625707..a9c469f 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -73,6 +73,5 @@ class DocxConverter(HtmlConverter): style_map = kwargs.get("style_map", None) return self._html_converter.convert_string( - mammoth.convert_to_html(file_stream, style_map=style_map).value, - **kwargs + mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs ) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index bcf89a8..087da32 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -215,7 +215,10 @@ class PptxConverter(DocumentConverter): first_row = False html_table += "" - return self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + "\n" + return ( + self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + + "\n" + ) def _convert_chart_to_markdown(self, chart): try: diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 68d7531..c0f7e0e 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -76,7 +76,9 @@ class WikipediaConverter(DocumentConverter): main_title = title_elm.string # Convert the page - webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(**kwargs).convert_soup(body_elm) + webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify( + **kwargs + ).convert_soup(body_elm) else: webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 1b337e8..28f73a0 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -86,7 +86,9 @@ class XlsxConverter(DocumentConverter): md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( - self._html_converter.convert_string(html_content, **kwargs).markdown.strip() + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + "\n\n" ) @@ -146,7 +148,9 @@ class XlsConverter(DocumentConverter): md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( - self._html_converter.convert_string(html_content, **kwargs).markdown.strip() + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + "\n\n" ) diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 5e37674..4a7b54a 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -10,9 +10,6 @@ class FileTestVector(object): url: str | None must_include: List[str] must_not_include: List[str] - # in test keep_data_uris cases, we want to ensure that the data URIs are kept - must_include_with_data_uris: List[str] = dataclasses.field(default_factory=list) - must_not_include_with_data_uris: List[str] = dataclasses.field(default_factory=list) GENERAL_TEST_VECTORS = [ @@ -33,12 +30,6 @@ GENERAL_TEST_VECTORS = [ must_not_include=[ "data:image/png;base64,iVBORw0KGgoAAAANSU", ], - must_include_with_data_uris=[ - "data:image/png;base64,iVBORw0KGgoAAAANSU", - ], - must_not_include_with_data_uris=[ - "data:image/png;base64...", - ], ), FileTestVector( filename="test.xlsx", @@ -77,17 +68,9 @@ GENERAL_TEST_VECTORS = [ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title "2003", # chart value - "![This phrase of the caption is Human-written.]", # image caption - ], - must_not_include=[ - "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE" - ], - must_include_with_data_uris=[ - "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE", - ], - must_not_include_with_data_uris=[ "![This phrase of the caption is Human-written.](Picture4.jpg)", ], + must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"], ), FileTestVector( filename="test_outlook_msg.msg", @@ -251,3 +234,45 @@ GENERAL_TEST_VECTORS = [ must_not_include=[], ), ] + + +DATA_URI_TEST_VECTORS = [ + FileTestVector( + filename="test.docx", + mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + charset=None, + url=None, + must_include=[ + "314b0a30-5b04-470b-b9f7-eed2c2bec74a", + "49e168b7-d2ae-407f-a055-2167576f39a1", + "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", + "# Abstract", + "# Introduction", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "data:image/png;base64,iVBORw0KGgoAAAANSU", + ], + must_not_include=[ + "data:image/png;base64...", + ], + ), + FileTestVector( + filename="test.pptx", + mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", + charset=None, + url=None, + must_include=[ + "2cdda5c8-e50e-4db4-b5f0-9722a649f455", + "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", + "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", + "1b92870d-e3b5-4e65-8153-919f4ff45592", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title + "2003", # chart value + "![This phrase of the caption is Human-written.]", # image caption + "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE", + ], + must_not_include=[ + "![This phrase of the caption is Human-written.](Picture4.jpg)", + ], + ), +] diff --git a/packages/markitdown/tests/test_cli_vectors.py b/packages/markitdown/tests/test_cli_vectors.py index 43c114b..6030482 100644 --- a/packages/markitdown/tests/test_cli_vectors.py +++ b/packages/markitdown/tests/test_cli_vectors.py @@ -7,9 +7,17 @@ import locale from typing import List if __name__ == "__main__": - from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector + from _test_vectors import ( + GENERAL_TEST_VECTORS, + DATA_URI_TEST_VECTORS, + FileTestVector, + ) else: - from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector + from ._test_vectors import ( + GENERAL_TEST_VECTORS, + DATA_URI_TEST_VECTORS, + FileTestVector, + ) from markitdown import ( MarkItDown, @@ -149,7 +157,7 @@ def test_convert_url(shared_tmp_dir, test_vector): assert test_string not in stdout -@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None: """Test CLI functionality when keep_data_uris is enabled""" @@ -173,21 +181,10 @@ def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None: with open(output_file, "r") as f: output_data = f.read() - for test_string in test_vector.must_include_with_data_uris: + for test_string in test_vector.must_include: assert test_string in output_data - for test_string in test_vector.must_not_include_with_data_uris: + for test_string in test_vector.must_not_include: assert test_string not in output_data - # Verify that basic test conditions are still met - for string in test_vector.must_include: - if "data:image" in string: - # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image) - continue - assert string in output_data - for string in test_vector.must_not_include: - if "data:image" in string: - # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image) - continue - assert string not in output_data os.remove(output_file) assert not os.path.exists(output_file), f"Output file not deleted: {output_file}" @@ -200,12 +197,12 @@ if __name__ == "__main__": """Runs this file's tests from the command line.""" with tempfile.TemporaryDirectory() as tmp_dir: + # General tests for test_function in [ test_output_to_stdout, test_output_to_file, test_input_from_stdin_without_hints, test_convert_url, - test_output_to_file_with_data_uris, ]: for test_vector in CLI_TEST_VECTORS: print( @@ -214,4 +211,17 @@ if __name__ == "__main__": ) test_function(tmp_dir, test_vector) print("OK") + + # Data URI tests + for test_function in [ + test_output_to_file_with_data_uris, + ]: + for test_vector in DATA_URI_TEST_VECTORS: + print( + f"Running {test_function.__name__} on {test_vector.filename}...", + end="", + ) + test_function(tmp_dir, test_vector) + print("OK") + print("All tests passed!") diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py index 05890ec..09e4a2b 100644 --- a/packages/markitdown/tests/test_module_vectors.py +++ b/packages/markitdown/tests/test_module_vectors.py @@ -6,9 +6,9 @@ import codecs if __name__ == "__main__": - from _test_vectors import GENERAL_TEST_VECTORS + from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS else: - from ._test_vectors import GENERAL_TEST_VECTORS + from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS from markitdown import ( MarkItDown, @@ -124,7 +124,7 @@ def test_convert_url(test_vector): assert string not in result.markdown -@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) def test_convert_with_data_uris(test_vector): """Test API functionality when keep_data_uris is enabled""" markitdown = MarkItDown() @@ -133,29 +133,16 @@ def test_convert_with_data_uris(test_vector): result = markitdown.convert( os.path.join(TEST_FILES_DIR, test_vector.filename), keep_data_uris=True, - url=test_vector.url + url=test_vector.url, ) - # Verify keep_data_uris related test conditions - for string in test_vector.must_include_with_data_uris: - assert string in result.markdown - for string in test_vector.must_not_include_with_data_uris: - assert string not in result.markdown - - # Verify that basic test conditions are still met for string in test_vector.must_include: - if "data:image" in string: - # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image) - continue assert string in result.markdown for string in test_vector.must_not_include: - if "data:image" in string: - # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image) - continue assert string not in result.markdown -@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) def test_convert_stream_with_data_uris(test_vector): """Test the conversion of a stream with no stream info.""" markitdown = MarkItDown() @@ -168,25 +155,12 @@ def test_convert_stream_with_data_uris(test_vector): with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: result = markitdown.convert( - stream, - stream_info=stream_info, - keep_data_uris=True, - url=test_vector.url + stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url ) - # Verify keep_data_uris related test conditions - for string in test_vector.must_include_with_data_uris: - assert string in result.markdown - for string in test_vector.must_not_include_with_data_uris: - assert string not in result.markdown - - # Verify that basic test conditions are still met for string in test_vector.must_include: assert string in result.markdown for string in test_vector.must_not_include: - # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image) - if "data:image" in string: - continue assert string not in result.markdown @@ -194,14 +168,14 @@ if __name__ == "__main__": import sys """Runs this file's tests from the command line.""" + + # General tests for test_function in [ test_guess_stream_info, test_convert_local, test_convert_stream_with_hints, test_convert_stream_without_hints, test_convert_url, - test_convert_with_data_uris, - test_convert_stream_with_data_uris, ]: for test_vector in GENERAL_TEST_VECTORS: print( @@ -209,4 +183,17 @@ if __name__ == "__main__": ) test_function(test_vector) print("OK") + + # Data URI tests + for test_function in [ + test_convert_with_data_uris, + test_convert_stream_with_data_uris, + ]: + for test_vector in DATA_URI_TEST_VECTORS: + print( + f"Running {test_function.__name__} on {test_vector.filename}...", end="" + ) + test_function(test_vector) + print("OK") + print("All tests passed!")