Use **kwargs to pass the keep_data_uris parameter.
Add module and CLI vector tests.
parent 4899148310
commit 1eaa879b25
13 changed files with 159 additions and 35 deletions
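The change threads a single keep_data_uris flag from the CLI and the MarkItDown convert entry points down to the Markdown renderer by forwarding **kwargs, instead of each converter plucking the option out of kwargs and re-passing it explicitly. Below is a minimal, hypothetical sketch of that forwarding pattern; the names _Markdownify, _HtmlConverter, render_img, and convert_img_tag are illustrative only, not the real markitdown API.

# Sketch of the **kwargs forwarding pattern applied in this commit (names are
# illustrative; markitdown's real classes take file streams and more options).

class _Markdownify:
    def __init__(self, **options):
        # Default the flag once, at the layer that actually uses it.
        options["keep_data_uris"] = options.get("keep_data_uris", False)
        self.options = options

    def render_img(self, alt: str, src: str) -> str:
        # Truncate data URIs unless the caller asked to keep them.
        if src.startswith("data:") and not self.options["keep_data_uris"]:
            src = src.split(",")[0] + "..."
        return "![%s](%s)" % (alt, src)


class _HtmlConverter:
    def convert_img_tag(self, alt: str, src: str, **kwargs) -> str:
        # Intermediate layers no longer touch keep_data_uris; they just forward.
        return _Markdownify(**kwargs).render_img(alt, src)


if __name__ == "__main__":
    conv = _HtmlConverter()
    print(conv.convert_img_tag("logo", "data:image/png;base64,iVBORw0KGgo="))
    # -> ![logo](data:image/png;base64...)
    print(conv.convert_img_tag("logo", "data:image/png;base64,iVBORw0KGgo=", keep_data_uris=True))
    # -> ![logo](data:image/png;base64,iVBORw0KGgo=)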
@@ -104,6 +104,12 @@ def main():
         help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
     )
 
+    parser.add_argument(
+        "--keep-data-uris",
+        action="store_true",
+        help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
+    )
+
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
 
@@ -181,9 +187,9 @@ def main():
     markitdown = MarkItDown(enable_plugins=args.use_plugins)
 
     if args.filename is None:
-        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
+        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris)
     else:
-        result = markitdown.convert(args.filename, stream_info=stream_info)
+        result = markitdown.convert(args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris)
 
     _handle_output(args, result)
 
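With the new flag wired through, the CLI can be invoked as, for example, python -m markitdown --keep-data-uris -o out.md input.docx (illustrative file names); this mirrors the subprocess invocation driven by the new CLI test added further below.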
@@ -79,9 +79,7 @@ class BingSerpConverter(DocumentConverter):
             slug.extract()
 
         # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify(
-            keep_data_uris=kwargs.get("keep_data_uris", False)
-        )
+        _markdownify = _CustomMarkdownify(**kwargs)
         results = list()
         for result in soup.find_all(class_="b_algo"):
             if not hasattr(result, "find_all"):
@@ -74,5 +74,5 @@ class DocxConverter(HtmlConverter):
         style_map = kwargs.get("style_map", None)
         return self._html_converter.convert_string(
             mammoth.convert_to_html(file_stream, style_map=style_map).value,
-            keep_data_uris=kwargs.get("keep_data_uris", False),
+            **kwargs
         )
@@ -55,15 +55,10 @@ class HtmlConverter(DocumentConverter):
         # Print only the main content
         body_elm = soup.find("body")
         webpage_text = ""
-        keep_data_uris = kwargs.get("keep_data_uris", False)
         if body_elm:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=keep_data_uris
-            ).convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=keep_data_uris
-            ).convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
 
         assert isinstance(webpage_text, str)
 
@@ -17,7 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
 
     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
-        self.keep_data_uris = options.pop("keep_data_uris", False)
+        options["keep_data_uris"] = options.get("keep_data_uris", False)
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 
@@ -102,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
             return alt
 
         # Remove dataURIs
-        if src.startswith("data:") and not self.keep_data_uris:
+        if src.startswith("data:") and not self.options["keep_data_uris"]:
             src = src.split(",")[0] + "..."
 
         return "![%s](%s%s)" % (alt, src, title_part)
@@ -78,9 +78,6 @@ class PptxConverter(DocumentConverter):
                 _dependency_exc_info[2]
             )
 
-        # Get the keep_data_uris parameter
-        keep_data_uris = kwargs.get("keep_data_uris", False)
-
         # Perform the conversion
         presentation = pptx.Presentation(file_stream)
         md_content = ""
@@ -144,7 +141,7 @@ class PptxConverter(DocumentConverter):
                 alt_text = re.sub(r"\s+", " ", alt_text).strip()
 
                 # If keep_data_uris is True, use base64 encoding for images
-                if keep_data_uris:
+                if kwargs.get("keep_data_uris", False):
                     blob = shape.image.blob
                     content_type = shape.image.content_type or "image/png"
                     b64_string = base64.b64encode(blob).decode("utf-8")
@@ -156,7 +153,7 @@ class PptxConverter(DocumentConverter):
 
             # Tables
             if self._is_table(shape):
-                md_content += self._convert_table_to_markdown(shape.table)
+                md_content += self._convert_table_to_markdown(shape.table, **kwargs)
 
             # Charts
             if shape.has_chart:
@@ -203,7 +200,7 @@ class PptxConverter(DocumentConverter):
             return True
         return False
 
-    def _convert_table_to_markdown(self, table):
+    def _convert_table_to_markdown(self, table, **kwargs):
         # Write the table as HTML, then convert it to Markdown
         html_table = "<html><body><table>"
         first_row = True
@@ -218,7 +215,7 @@ class PptxConverter(DocumentConverter):
             first_row = False
         html_table += "</table></body></html>"
 
-        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+        return self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + "\n"
 
     def _convert_chart_to_markdown(self, chart):
         try:
@@ -171,9 +171,7 @@ class RssConverter(DocumentConverter):
         try:
             # using bs4 because many RSS feeds have HTML-styled content
             soup = BeautifulSoup(content, "html.parser")
-            return _CustomMarkdownify(
-                keep_data_uris=self._kwargs.get("keep_data_uris", False)
-            ).convert_soup(soup)
+            return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
         except BaseException as _:
             return content
 
@@ -76,13 +76,9 @@ class WikipediaConverter(DocumentConverter):
                 main_title = title_elm.string
 
             # Convert the page
-            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
-                keep_data_uris=kwargs.get("keep_data_uris", False)
-            ).convert_soup(body_elm)
+            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(**kwargs).convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=kwargs.get("keep_data_uris", False)
-            ).convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
 
         return DocumentConverterResult(
             markdown=webpage_text,
@@ -86,7 +86,7 @@ class XlsxConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(html_content).markdown.strip()
+                self._html_converter.convert_string(html_content, **kwargs).markdown.strip()
                 + "\n\n"
             )
 
@@ -146,7 +146,7 @@ class XlsConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(html_content).markdown.strip()
+                self._html_converter.convert_string(html_content, **kwargs).markdown.strip()
                 + "\n\n"
             )
 
@@ -10,6 +10,9 @@ class FileTestVector(object):
     url: str | None
     must_include: List[str]
     must_not_include: List[str]
+    # in test keep_data_uris cases, we want to ensure that the data URIs are kept
+    must_include_with_data_uris: List[str] = dataclasses.field(default_factory=list)
+    must_not_include_with_data_uris: List[str] = dataclasses.field(default_factory=list)
 
 
 GENERAL_TEST_VECTORS = [
@@ -25,8 +28,17 @@ GENERAL_TEST_VECTORS = [
             "# Abstract",
             "# Introduction",
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+            "data:image/png;base64...",
+        ],
+        must_not_include=[
+            "data:image/png;base64,iVBORw0KGgoAAAANSU",
+        ],
+        must_include_with_data_uris=[
+            "data:image/png;base64,iVBORw0KGgoAAAANSU",
+        ],
+        must_not_include_with_data_uris=[
+            "data:image/png;base64...",
         ],
-        must_not_include=[],
     ),
     FileTestVector(
         filename="test.xlsx",
@@ -65,8 +77,17 @@ GENERAL_TEST_VECTORS = [
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
             "2003",  # chart value
+            "![This phrase of the caption is Human-written.]",  # image caption
+        ],
+        must_not_include=[
+            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"
+        ],
+        must_include_with_data_uris=[
+            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
+        ],
+        must_not_include_with_data_uris=[
+            "",
         ],
-        must_not_include=[],
     ),
     FileTestVector(
         filename="test_outlook_msg.msg",
@@ -149,6 +149,50 @@ def test_convert_url(shared_tmp_dir, test_vector):
         assert test_string not in stdout
 
 
+@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
+def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
+    """Test CLI functionality when keep_data_uris is enabled"""
+
+    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
+    result = subprocess.run(
+        [
+            "python",
+            "-m",
+            "markitdown",
+            "--keep-data-uris",
+            "-o",
+            output_file,
+            os.path.join(TEST_FILES_DIR, test_vector.filename),
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    assert os.path.exists(output_file), f"Output file not created: {output_file}"
+
+    with open(output_file, "r") as f:
+        output_data = f.read()
+        for test_string in test_vector.must_include_with_data_uris:
+            assert test_string in output_data
+        for test_string in test_vector.must_not_include_with_data_uris:
+            assert test_string not in output_data
+        # Verify that basic test conditions are still met
+        for string in test_vector.must_include:
+            if "data:image" in string:
+                # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
+                continue
+            assert string in output_data
+        for string in test_vector.must_not_include:
+            if "data:image" in string:
+                # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
+                continue
+            assert string not in output_data
+
+    os.remove(output_file)
+    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
+
+
 if __name__ == "__main__":
     import sys
     import tempfile
@@ -161,6 +205,7 @@ if __name__ == "__main__":
         test_output_to_file,
         test_input_from_stdin_without_hints,
         test_convert_url,
+        test_output_to_file_with_data_uris,
     ]:
         for test_vector in CLI_TEST_VECTORS:
             print(
BIN packages/markitdown/tests/test_files/test.docx (vendored)
Binary file not shown.
@@ -124,6 +124,72 @@ def test_convert_url(test_vector):
         assert string not in result.markdown
 
 
+@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
+def test_convert_with_data_uris(test_vector):
+    """Test API functionality when keep_data_uris is enabled"""
+    markitdown = MarkItDown()
+
+    # Test local file conversion
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, test_vector.filename),
+        keep_data_uris=True,
+        url=test_vector.url
+    )
+
+    # Verify keep_data_uris related test conditions
+    for string in test_vector.must_include_with_data_uris:
+        assert string in result.markdown
+    for string in test_vector.must_not_include_with_data_uris:
+        assert string not in result.markdown
+
+    # Verify that basic test conditions are still met
+    for string in test_vector.must_include:
+        if "data:image" in string:
+            # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
+            continue
+        assert string in result.markdown
+    for string in test_vector.must_not_include:
+        if "data:image" in string:
+            # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
+            continue
+        assert string not in result.markdown
+
+
+@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
+def test_convert_stream_with_data_uris(test_vector):
+    """Test stream conversion when keep_data_uris is enabled."""
+    markitdown = MarkItDown()
+
+    stream_info = StreamInfo(
+        extension=os.path.splitext(test_vector.filename)[1],
+        mimetype=test_vector.mimetype,
+        charset=test_vector.charset,
+    )
+
+    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
+        result = markitdown.convert(
+            stream,
+            stream_info=stream_info,
+            keep_data_uris=True,
+            url=test_vector.url
+        )
+
+    # Verify keep_data_uris related test conditions
+    for string in test_vector.must_include_with_data_uris:
+        assert string in result.markdown
+    for string in test_vector.must_not_include_with_data_uris:
+        assert string not in result.markdown
+
+    # Verify that basic test conditions are still met
+    for string in test_vector.must_include:
+        assert string in result.markdown
+    for string in test_vector.must_not_include:
+        # Skip data:image related tests (originally we truncate images and don't want to include data:image; but now we want to include data:image)
+        if "data:image" in string:
+            continue
+        assert string not in result.markdown
+
+
 if __name__ == "__main__":
     import sys
 
@@ -134,6 +200,8 @@ if __name__ == "__main__":
         test_convert_stream_with_hints,
         test_convert_stream_without_hints,
         test_convert_url,
+        test_convert_with_data_uris,
+        test_convert_stream_with_data_uris,
     ]:
         for test_vector in GENERAL_TEST_VECTORS:
             print(