diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6a5d01b..6d3d3c7 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -104,6 +104,12 @@ def main(): help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", ) + parser.add_argument( + "--keep-data-uris", + action="store_true", + help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -181,9 +187,9 @@ def main(): markitdown = MarkItDown(enable_plugins=args.use_plugins) if args.filename is None: - result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info) + result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris) else: - result = markitdown.convert(args.filename, stream_info=stream_info) + result = markitdown.convert(args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris) _handle_output(args, result) diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 4108990..f65b85f 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -79,9 +79,7 @@ class BingSerpConverter(DocumentConverter): slug.extract() # Parse the algorithmic results - _markdownify = _CustomMarkdownify( - keep_data_uris=kwargs.get("keep_data_uris", False) - ) + _markdownify = _CustomMarkdownify(**kwargs) results = list() for result in soup.find_all(class_="b_algo"): if not hasattr(result, "find_all"): diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 
44ff85a..4625707 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -74,5 +74,5 @@ class DocxConverter(HtmlConverter): style_map = kwargs.get("style_map", None) return self._html_converter.convert_string( mammoth.convert_to_html(file_stream, style_map=style_map).value, - keep_data_uris=kwargs.get("keep_data_uris", False), + **kwargs ) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 91595de..dabb0d7 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -55,15 +55,10 @@ class HtmlConverter(DocumentConverter): # Print only the main content body_elm = soup.find("body") webpage_text = "" - keep_data_uris = kwargs.get("keep_data_uris", False) if body_elm: - webpage_text = _CustomMarkdownify( - keep_data_uris=keep_data_uris - ).convert_soup(body_elm) + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) else: - webpage_text = _CustomMarkdownify( - keep_data_uris=keep_data_uris - ).convert_soup(soup) + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) assert isinstance(webpage_text, str) diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index d98e1a3..d98bdfb 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -17,7 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): def __init__(self, **options: Any): options["heading_style"] = options.get("heading_style", markdownify.ATX) - self.keep_data_uris = options.pop("keep_data_uris", False) + options["keep_data_uris"] = options.get("keep_data_uris", False) # Explicitly cast options to the expected type if 
necessary super().__init__(**options) @@ -102,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): return alt # Remove dataURIs - if src.startswith("data:") and not self.keep_data_uris: + if src.startswith("data:") and not self.options["keep_data_uris"]: src = src.split(",")[0] + "..." return "![%s](%s%s)" % (alt, src, title_part) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index a45a507..bcf89a8 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -78,9 +78,6 @@ class PptxConverter(DocumentConverter): _dependency_exc_info[2] ) - # Get the keep_data_uris parameter - keep_data_uris = kwargs.get("keep_data_uris", False) - # Perform the conversion presentation = pptx.Presentation(file_stream) md_content = "" @@ -144,7 +141,7 @@ class PptxConverter(DocumentConverter): alt_text = re.sub(r"\s+", " ", alt_text).strip() # If keep_data_uris is True, use base64 encoding for images - if keep_data_uris: + if kwargs.get("keep_data_uris", False): blob = shape.image.blob content_type = shape.image.content_type or "image/png" b64_string = base64.b64encode(blob).decode("utf-8") @@ -156,7 +153,7 @@ class PptxConverter(DocumentConverter): # Tables if self._is_table(shape): - md_content += self._convert_table_to_markdown(shape.table) + md_content += self._convert_table_to_markdown(shape.table, **kwargs) # Charts if shape.has_chart: @@ -203,7 +200,7 @@ class PptxConverter(DocumentConverter): return True return False - def _convert_table_to_markdown(self, table): + def _convert_table_to_markdown(self, table, **kwargs): # Write the table as HTML, then convert it to Markdown html_table = "" first_row = True @@ -218,7 +215,7 @@ class PptxConverter(DocumentConverter): first_row = False html_table += "
" - return self._html_converter.convert_string(html_table).markdown.strip() + "\n" + return self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + "\n" def _convert_chart_to_markdown(self, chart): try: diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 9a4e881..6a0e4c1 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -171,9 +171,7 @@ class RssConverter(DocumentConverter): try: # using bs4 because many RSS feeds have HTML-styled content soup = BeautifulSoup(content, "html.parser") - return _CustomMarkdownify( - keep_data_uris=self._kwargs.get("keep_data_uris", False) - ).convert_soup(soup) + return _CustomMarkdownify(**self._kwargs).convert_soup(soup) except BaseException as _: return content diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index fb49bad..68d7531 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -76,13 +76,9 @@ class WikipediaConverter(DocumentConverter): main_title = title_elm.string # Convert the page - webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify( - keep_data_uris=kwargs.get("keep_data_uris", False) - ).convert_soup(body_elm) + webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(**kwargs).convert_soup(body_elm) else: - webpage_text = _CustomMarkdownify( - keep_data_uris=kwargs.get("keep_data_uris", False) - ).convert_soup(soup) + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) return DocumentConverterResult( markdown=webpage_text, diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 
3d0e1ab..1b337e8 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -86,7 +86,7 @@ class XlsxConverter(DocumentConverter): md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( - self._html_converter.convert_string(html_content).markdown.strip() + self._html_converter.convert_string(html_content, **kwargs).markdown.strip() + "\n\n" ) @@ -146,7 +146,7 @@ class XlsConverter(DocumentConverter): md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( - self._html_converter.convert_string(html_content).markdown.strip() + self._html_converter.convert_string(html_content, **kwargs).markdown.strip() + "\n\n" ) diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 8610108..5e37674 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -10,6 +10,9 @@ class FileTestVector(object): url: str | None must_include: List[str] must_not_include: List[str] + # in test keep_data_uris cases, we want to ensure that the data URIs are kept + must_include_with_data_uris: List[str] = dataclasses.field(default_factory=list) + must_not_include_with_data_uris: List[str] = dataclasses.field(default_factory=list) GENERAL_TEST_VECTORS = [ @@ -25,8 +28,17 @@ GENERAL_TEST_VECTORS = [ "# Abstract", "# Introduction", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "data:image/png;base64...", + ], + must_not_include=[ + "data:image/png;base64,iVBORw0KGgoAAAANSU", + ], + must_include_with_data_uris=[ + "data:image/png;base64,iVBORw0KGgoAAAANSU", + ], + must_not_include_with_data_uris=[ + "data:image/png;base64...", ], - must_not_include=[], ), FileTestVector( filename="test.xlsx", @@ -65,8 +77,17 @@ GENERAL_TEST_VECTORS = [ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title "2003", # chart value + "![This phrase of the caption is Human-written.]", # image caption + ], + must_not_include=[ + "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE" + ], + must_include_with_data_uris=[ + "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE", + ], + must_not_include_with_data_uris=[ + "![This phrase of the caption is Human-written.](Picture4.jpg)", ], - must_not_include=[], ), FileTestVector( filename="test_outlook_msg.msg", diff --git a/packages/markitdown/tests/test_cli_vectors.py b/packages/markitdown/tests/test_cli_vectors.py index 64128d6..43c114b 100644 --- a/packages/markitdown/tests/test_cli_vectors.py +++ b/packages/markitdown/tests/test_cli_vectors.py @@ -149,6 +149,50 @@ def test_convert_url(shared_tmp_dir, test_vector): assert test_string not in stdout +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None: + """Test CLI functionality when keep_data_uris is enabled""" + + output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output") + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + "--keep-data-uris", + "-o", + output_file, + os.path.join(TEST_FILES_DIR, test_vector.filename), + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + assert os.path.exists(output_file), f"Output file not created: {output_file}" + + with open(output_file, "r") as f: + output_data = f.read() + for test_string in test_vector.must_include_with_data_uris: + assert test_string in output_data + for test_string in test_vector.must_not_include_with_data_uris: + assert test_string not in output_data + # Verify that basic test conditions are still met + for string in test_vector.must_include: + if "data:image" in string: + # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to 
include data:image) + continue + assert string in output_data + for string in test_vector.must_not_include: + if "data:image" in string: + # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image) + continue + assert string not in output_data + + os.remove(output_file) + assert not os.path.exists(output_file), f"Output file not deleted: {output_file}" + + if __name__ == "__main__": import sys import tempfile @@ -161,6 +205,7 @@ if __name__ == "__main__": test_output_to_file, test_input_from_stdin_without_hints, test_convert_url, + test_output_to_file_with_data_uris, ]: for test_vector in CLI_TEST_VECTORS: print( diff --git a/packages/markitdown/tests/test_files/test.docx b/packages/markitdown/tests/test_files/test.docx index 984018a..79e281d 100644 Binary files a/packages/markitdown/tests/test_files/test.docx and b/packages/markitdown/tests/test_files/test.docx differ diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py index 9afffa5..05890ec 100644 --- a/packages/markitdown/tests/test_module_vectors.py +++ b/packages/markitdown/tests/test_module_vectors.py @@ -124,6 +124,72 @@ def test_convert_url(test_vector): assert string not in result.markdown +@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +def test_convert_with_data_uris(test_vector): + """Test API functionality when keep_data_uris is enabled""" + markitdown = MarkItDown() + + # Test local file conversion + result = markitdown.convert( + os.path.join(TEST_FILES_DIR, test_vector.filename), + keep_data_uris=True, + url=test_vector.url + ) + + # Verify keep_data_uris related test conditions + for string in test_vector.must_include_with_data_uris: + assert string in result.markdown + for string in test_vector.must_not_include_with_data_uris: + assert string not in result.markdown + + # Verify that basic test conditions are still met + for string in 
test_vector.must_include: + if "data:image" in string: + # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image) + continue + assert string in result.markdown + for string in test_vector.must_not_include: + if "data:image" in string: + # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image) + continue + assert string not in result.markdown + + +@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +def test_convert_stream_with_data_uris(test_vector): + """Test stream conversion with stream info hints when keep_data_uris is enabled""" + markitdown = MarkItDown() + + stream_info = StreamInfo( + extension=os.path.splitext(test_vector.filename)[1], + mimetype=test_vector.mimetype, + charset=test_vector.charset, + ) + + with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: + result = markitdown.convert( + stream, + stream_info=stream_info, + keep_data_uris=True, + url=test_vector.url + ) + + # Verify keep_data_uris related test conditions + for string in test_vector.must_include_with_data_uris: + assert string in result.markdown + for string in test_vector.must_not_include_with_data_uris: + assert string not in result.markdown + + # Verify that basic test conditions are still met + for string in test_vector.must_include: + # Skip data:image strings: they describe the truncated output, but data URIs are kept here + if "data:image" not in string: + assert string in result.markdown + for string in test_vector.must_not_include: + if "data:image" not in string: + assert string not in result.markdown + + if __name__ == "__main__": import sys @@ -134,6 +200,8 @@ if __name__ == "__main__": test_convert_stream_with_hints, test_convert_stream_without_hints, test_convert_url, + test_convert_with_data_uris, + test_convert_stream_with_data_uris, ]: for test_vector in
GENERAL_TEST_VECTORS: print(