Use **kwargs to pass the keep_data_uris parameter.

Add module and CLI vector tests
Yuzhong Zhang 2025-03-21 00:49:36 +08:00
parent 4899148310
commit 1eaa879b25
13 changed files with 159 additions and 35 deletions
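In outline, the commit stops each converter from extracting keep_data_uris itself and instead forwards **kwargs straight through to _CustomMarkdownify, which now owns the default. A minimal sketch of the pattern follows; the class bodies are simplified stand-ins, not the actual converter code:

def convert(soup, **kwargs):
    # Any converter-level option, including keep_data_uris, rides along in kwargs
    return _CustomMarkdownify(**kwargs).convert_soup(soup)


class _CustomMarkdownify:
    def __init__(self, **options):
        # keep_data_uris defaults to False when callers do not pass it
        self.options = {"keep_data_uris": False, **options}

    def convert_soup(self, soup):
        return ""  # markdownify does the real work; elided in this sketch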

View file

@@ -104,6 +104,12 @@ def main():
         help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
     )
+    parser.add_argument(
+        "--keep-data-uris",
+        action="store_true",
+        help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
+    )
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
@@ -181,9 +187,9 @@ def main():
     markitdown = MarkItDown(enable_plugins=args.use_plugins)
     if args.filename is None:
-        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
+        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris)
     else:
-        result = markitdown.convert(args.filename, stream_info=stream_info)
+        result = markitdown.convert(args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris)
     _handle_output(args, result)
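With the flag wired through, a typical invocation looks like the following (file names are illustrative):

python -m markitdown --keep-data-uris slides.pptx -o slides.md

Without the flag, data URIs in the output are truncated to their media-type prefix (for example, data:image/png;base64...).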

View file

@@ -79,9 +79,7 @@ class BingSerpConverter(DocumentConverter):
             slug.extract()

         # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify(
-            keep_data_uris=kwargs.get("keep_data_uris", False)
-        )
+        _markdownify = _CustomMarkdownify(**kwargs)
         results = list()
         for result in soup.find_all(class_="b_algo"):
             if not hasattr(result, "find_all"):

View file

@@ -74,5 +74,5 @@ class DocxConverter(HtmlConverter):
         style_map = kwargs.get("style_map", None)
         return self._html_converter.convert_string(
             mammoth.convert_to_html(file_stream, style_map=style_map).value,
-            keep_data_uris=kwargs.get("keep_data_uris", False),
+            **kwargs
         )
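The same option is available from the Python API and is exercised by the new module tests further down; a short example, with an illustrative document path:

from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("report.docx", keep_data_uris=True)
print(result.markdown)  # embedded images appear as full data: URIs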

View file

@@ -55,15 +55,10 @@ class HtmlConverter(DocumentConverter):
         # Print only the main content
         body_elm = soup.find("body")
         webpage_text = ""
-        keep_data_uris = kwargs.get("keep_data_uris", False)
         if body_elm:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=keep_data_uris
-            ).convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=keep_data_uris
-            ).convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)

         assert isinstance(webpage_text, str)

View file

@@ -17,7 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
-        self.keep_data_uris = options.pop("keep_data_uris", False)
+        options["keep_data_uris"] = options.get("keep_data_uris", False)
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
@@ -102,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
             return alt

         # Remove dataURIs
-        if src.startswith("data:") and not self.keep_data_uris:
+        if src.startswith("data:") and not self.options["keep_data_uris"]:
             src = src.split(",")[0] + "..."

         return "![%s](%s%s)" % (alt, src, title_part)

View file

@@ -78,9 +78,6 @@ class PptxConverter(DocumentConverter):
                 _dependency_exc_info[2]
             )

-        # Get the keep_data_uris parameter
-        keep_data_uris = kwargs.get("keep_data_uris", False)
-
         # Perform the conversion
         presentation = pptx.Presentation(file_stream)
         md_content = ""
@@ -144,7 +141,7 @@ class PptxConverter(DocumentConverter):
                     alt_text = re.sub(r"\s+", " ", alt_text).strip()

                     # If keep_data_uris is True, use base64 encoding for images
-                    if keep_data_uris:
+                    if kwargs.get("keep_data_uris", False):
                         blob = shape.image.blob
                         content_type = shape.image.content_type or "image/png"
                         b64_string = base64.b64encode(blob).decode("utf-8")
@@ -156,7 +153,7 @@ class PptxConverter(DocumentConverter):
                 # Tables
                 if self._is_table(shape):
-                    md_content += self._convert_table_to_markdown(shape.table)
+                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)

                 # Charts
                 if shape.has_chart:
@@ -203,7 +200,7 @@ class PptxConverter(DocumentConverter):
                 return True
         return False

-    def _convert_table_to_markdown(self, table):
+    def _convert_table_to_markdown(self, table, **kwargs):
         # Write the table as HTML, then convert it to Markdown
         html_table = "<html><body><table>"
         first_row = True
@@ -218,7 +215,7 @@ class PptxConverter(DocumentConverter):
             first_row = False
         html_table += "</table></body></html>"

-        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+        return self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + "\n"

     def _convert_chart_to_markdown(self, chart):
         try:
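When keep_data_uris is set, the picture branch above builds an inline image from the shape blob. The Markdown line it ultimately emits falls outside this hunk, so the exact format is an assumption; a self-contained sketch with stand-in values:

import base64

# Stand-ins for shape.image.blob / content_type in the real converter
blob = b"\x89PNG\r\n\x1a\n"  # raw image bytes
content_type = "image/png"
alt_text = "caption"

b64_string = base64.b64encode(blob).decode("utf-8")
# Assumed output shape; the real line emitted lives outside this hunk
image_md = f"![{alt_text}](data:{content_type};base64,{b64_string})"
print(image_md)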

View file

@@ -171,9 +171,7 @@ class RssConverter(DocumentConverter):
         try:
             # using bs4 because many RSS feeds have HTML-styled content
             soup = BeautifulSoup(content, "html.parser")
-            return _CustomMarkdownify(
-                keep_data_uris=self._kwargs.get("keep_data_uris", False)
-            ).convert_soup(soup)
+            return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
         except BaseException as _:
             return content

View file

@@ -76,13 +76,9 @@ class WikipediaConverter(DocumentConverter):
                 main_title = title_elm.string

             # Convert the page
-            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
-                keep_data_uris=kwargs.get("keep_data_uris", False)
-            ).convert_soup(body_elm)
+            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(**kwargs).convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=kwargs.get("keep_data_uris", False)
-            ).convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)

         return DocumentConverterResult(
             markdown=webpage_text,

View file

@@ -86,7 +86,7 @@ class XlsxConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(html_content).markdown.strip()
+                self._html_converter.convert_string(html_content, **kwargs).markdown.strip()
                 + "\n\n"
             )
@@ -146,7 +146,7 @@ class XlsConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(html_content).markdown.strip()
+                self._html_converter.convert_string(html_content, **kwargs).markdown.strip()
                 + "\n\n"
             )

View file

@@ -10,6 +10,9 @@ class FileTestVector(object):
     url: str | None
     must_include: List[str]
     must_not_include: List[str]
+    # in test keep_data_uris cases, we want to ensure that the data URIs are kept
+    must_include_with_data_uris: List[str] = dataclasses.field(default_factory=list)
+    must_not_include_with_data_uris: List[str] = dataclasses.field(default_factory=list)


 GENERAL_TEST_VECTORS = [
@@ -25,8 +28,17 @@ GENERAL_TEST_VECTORS = [
             "# Abstract",
             "# Introduction",
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+            "data:image/png;base64...",
+        ],
+        must_not_include=[
+            "data:image/png;base64,iVBORw0KGgoAAAANSU",
+        ],
+        must_include_with_data_uris=[
+            "data:image/png;base64,iVBORw0KGgoAAAANSU",
+        ],
+        must_not_include_with_data_uris=[
+            "data:image/png;base64...",
         ],
-        must_not_include=[],
     ),
     FileTestVector(
         filename="test.xlsx",
@@ -65,8 +77,17 @@ GENERAL_TEST_VECTORS = [
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
             "2003",  # chart value
+            "![This phrase of the caption is Human-written.]",  # image caption
+        ],
+        must_not_include=[
+            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"
+        ],
+        must_include_with_data_uris=[
+            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
+        ],
+        must_not_include_with_data_uris=[
+            "![This phrase of the caption is Human-written.](Picture4.jpg)",
         ],
-        must_not_include=[],
     ),
     FileTestVector(
         filename="test_outlook_msg.msg",

View file

@@ -149,6 +149,50 @@ def test_convert_url(shared_tmp_dir, test_vector):
         assert test_string not in stdout


+@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
+def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
+    """Test CLI functionality when keep_data_uris is enabled"""
+    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
+    result = subprocess.run(
+        [
+            "python",
+            "-m",
+            "markitdown",
+            "--keep-data-uris",
+            "-o",
+            output_file,
+            os.path.join(TEST_FILES_DIR, test_vector.filename),
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    assert os.path.exists(output_file), f"Output file not created: {output_file}"
+
+    with open(output_file, "r") as f:
+        output_data = f.read()
+        for test_string in test_vector.must_include_with_data_uris:
+            assert test_string in output_data
+        for test_string in test_vector.must_not_include_with_data_uris:
+            assert test_string not in output_data
+
+        # Verify that the basic test conditions are still met
+        for string in test_vector.must_include:
+            if "data:image" in string:
+                # Skip data:image expectations: the default vectors assume
+                # truncated data URIs, but here they are kept in full
+                continue
+            assert string in output_data
+        for string in test_vector.must_not_include:
+            if "data:image" in string:
+                continue
+            assert string not in output_data
+
+    os.remove(output_file)
+    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
+
+
 if __name__ == "__main__":
     import sys
     import tempfile
@@ -161,6 +205,7 @@ if __name__ == "__main__":
         test_output_to_file,
         test_input_from_stdin_without_hints,
         test_convert_url,
+        test_output_to_file_with_data_uris,
     ]:
         for test_vector in CLI_TEST_VECTORS:
             print(
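Assuming a standard pytest setup for this repository, the new cases can be selected by keyword:

pytest -k "data_uris"

This picks up test_output_to_file_with_data_uris above plus the two module-level tests added below.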

Binary file not shown.

View file

@@ -124,6 +124,72 @@ def test_convert_url(test_vector):
         assert string not in result.markdown


+@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
+def test_convert_with_data_uris(test_vector):
+    """Test API functionality when keep_data_uris is enabled"""
+    markitdown = MarkItDown()
+
+    # Test local file conversion
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, test_vector.filename),
+        keep_data_uris=True,
+        url=test_vector.url,
+    )
+
+    # Verify keep_data_uris related test conditions
+    for string in test_vector.must_include_with_data_uris:
+        assert string in result.markdown
+    for string in test_vector.must_not_include_with_data_uris:
+        assert string not in result.markdown
+
+    # Verify that the basic test conditions are still met
+    for string in test_vector.must_include:
+        if "data:image" in string:
+            # Skip data:image expectations: the default vectors assume
+            # truncated data URIs, but here they are kept in full
+            continue
+        assert string in result.markdown
+    for string in test_vector.must_not_include:
+        if "data:image" in string:
+            continue
+        assert string not in result.markdown
+
+
+@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
+def test_convert_stream_with_data_uris(test_vector):
+    """Test stream conversion when keep_data_uris is enabled"""
+    markitdown = MarkItDown()
+
+    stream_info = StreamInfo(
+        extension=os.path.splitext(test_vector.filename)[1],
+        mimetype=test_vector.mimetype,
+        charset=test_vector.charset,
+    )
+
+    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
+        result = markitdown.convert(
+            stream,
+            stream_info=stream_info,
+            keep_data_uris=True,
+            url=test_vector.url,
+        )
+
+    # Verify keep_data_uris related test conditions
+    for string in test_vector.must_include_with_data_uris:
+        assert string in result.markdown
+    for string in test_vector.must_not_include_with_data_uris:
+        assert string not in result.markdown
+
+    # Verify that the basic test conditions are still met
+    for string in test_vector.must_include:
+        if "data:image" in string:
+            # Same skip as above: truncated-URI expectations do not apply here
+            continue
+        assert string in result.markdown
+    for string in test_vector.must_not_include:
+        if "data:image" in string:
+            continue
+        assert string not in result.markdown
+
+
 if __name__ == "__main__":
     import sys
@@ -134,6 +200,8 @@ if __name__ == "__main__":
         test_convert_stream_with_hints,
         test_convert_stream_without_hints,
         test_convert_url,
+        test_convert_with_data_uris,
+        test_convert_stream_with_data_uris,
     ]:
         for test_vector in GENERAL_TEST_VECTORS:
             print(