Use **kwargs to pass the keep_data_uris parameter.

Add module and CLI vector tests
Yuzhong Zhang 2025-03-21 00:49:36 +08:00
parent 4899148310
commit 1eaa879b25
13 changed files with 159 additions and 35 deletions
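In outline, the commit stops each converter from extracting keep_data_uris itself and instead forwards **kwargs straight through to _CustomMarkdownify, which now owns the default. A minimal sketch of the pattern follows; the class bodies are simplified stand-ins, not the actual converter code:

def convert(soup, **kwargs):
    # Any converter-level option, including keep_data_uris, rides along in kwargs
    return _CustomMarkdownify(**kwargs).convert_soup(soup)


class _CustomMarkdownify:
    def __init__(self, **options):
        # keep_data_uris defaults to False when callers do not pass it
        self.options = {"keep_data_uris": False, **options}

    def convert_soup(self, soup):
        return ""  # markdownify does the real work; elided in this sketch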

View file

@@ -104,6 +104,12 @@ def main():
         help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
     )
+    parser.add_argument(
+        "--keep-data-uris",
+        action="store_true",
+        help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
+    )
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
@@ -181,9 +187,9 @@ def main():
     markitdown = MarkItDown(enable_plugins=args.use_plugins)
     if args.filename is None:
-        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
+        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris)
     else:
-        result = markitdown.convert(args.filename, stream_info=stream_info)
+        result = markitdown.convert(args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris)
     _handle_output(args, result)
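With the flag wired through, a typical invocation looks like the following (file names are illustrative):

python -m markitdown --keep-data-uris slides.pptx -o slides.md

Without the flag, data URIs in the output are truncated to their media-type prefix (for example, data:image/png;base64...).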

View file

@@ -79,9 +79,7 @@ class BingSerpConverter(DocumentConverter):
             slug.extract()

         # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify(
-            keep_data_uris=kwargs.get("keep_data_uris", False)
-        )
+        _markdownify = _CustomMarkdownify(**kwargs)
         results = list()
         for result in soup.find_all(class_="b_algo"):
             if not hasattr(result, "find_all"):

View file

@@ -74,5 +74,5 @@ class DocxConverter(HtmlConverter):
         style_map = kwargs.get("style_map", None)
         return self._html_converter.convert_string(
             mammoth.convert_to_html(file_stream, style_map=style_map).value,
-            keep_data_uris=kwargs.get("keep_data_uris", False),
+            **kwargs
         )
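The same option is available from the Python API and is exercised by the new module tests further down; a short example, with an illustrative document path:

from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("report.docx", keep_data_uris=True)
print(result.markdown)  # embedded images appear as full data: URIs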

View file

@@ -55,15 +55,10 @@ class HtmlConverter(DocumentConverter):
         # Print only the main content
         body_elm = soup.find("body")
         webpage_text = ""
-        keep_data_uris = kwargs.get("keep_data_uris", False)
         if body_elm:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=keep_data_uris
-            ).convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=keep_data_uris
-            ).convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)

         assert isinstance(webpage_text, str)

View file

@@ -17,7 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
-        self.keep_data_uris = options.pop("keep_data_uris", False)
+        options["keep_data_uris"] = options.get("keep_data_uris", False)
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
@@ -102,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
             return alt

         # Remove dataURIs
-        if src.startswith("data:") and not self.keep_data_uris:
+        if src.startswith("data:") and not self.options["keep_data_uris"]:
             src = src.split(",")[0] + "..."

         return "![%s](%s%s)" % (alt, src, title_part)

View file

@@ -78,9 +78,6 @@ class PptxConverter(DocumentConverter):
                 _dependency_exc_info[2]
             )

-        # Get the keep_data_uris parameter
-        keep_data_uris = kwargs.get("keep_data_uris", False)
-
         # Perform the conversion
         presentation = pptx.Presentation(file_stream)
         md_content = ""
@@ -144,7 +141,7 @@ class PptxConverter(DocumentConverter):
                     alt_text = re.sub(r"\s+", " ", alt_text).strip()

                     # If keep_data_uris is True, use base64 encoding for images
-                    if keep_data_uris:
+                    if kwargs.get("keep_data_uris", False):
                         blob = shape.image.blob
                         content_type = shape.image.content_type or "image/png"
                         b64_string = base64.b64encode(blob).decode("utf-8")
@@ -156,7 +153,7 @@ class PptxConverter(DocumentConverter):
                 # Tables
                 if self._is_table(shape):
-                    md_content += self._convert_table_to_markdown(shape.table)
+                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)

                 # Charts
                 if shape.has_chart:
@@ -203,7 +200,7 @@ class PptxConverter(DocumentConverter):
                 return True
         return False

-    def _convert_table_to_markdown(self, table):
+    def _convert_table_to_markdown(self, table, **kwargs):
         # Write the table as HTML, then convert it to Markdown
         html_table = "<html><body><table>"
         first_row = True
@@ -218,7 +215,7 @@ class PptxConverter(DocumentConverter):
             first_row = False
         html_table += "</table></body></html>"

-        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+        return self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + "\n"

     def _convert_chart_to_markdown(self, chart):
         try:
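When keep_data_uris is set, the picture branch above builds an inline image from the shape blob. The Markdown line it ultimately emits falls outside this hunk, so the exact format is an assumption; a self-contained sketch with stand-in values:

import base64

# Stand-ins for shape.image.blob / content_type in the real converter
blob = b"\x89PNG\r\n\x1a\n"  # raw image bytes
content_type = "image/png"
alt_text = "caption"

b64_string = base64.b64encode(blob).decode("utf-8")
# Assumed output shape; the real line emitted lives outside this hunk
image_md = f"![{alt_text}](data:{content_type};base64,{b64_string})"
print(image_md)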

View file

@@ -171,9 +171,7 @@ class RssConverter(DocumentConverter):
         try:
             # using bs4 because many RSS feeds have HTML-styled content
             soup = BeautifulSoup(content, "html.parser")
-            return _CustomMarkdownify(
-                keep_data_uris=self._kwargs.get("keep_data_uris", False)
-            ).convert_soup(soup)
+            return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
         except BaseException as _:
             return content

View file

@@ -76,13 +76,9 @@ class WikipediaConverter(DocumentConverter):
                 main_title = title_elm.string

             # Convert the page
-            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
-                keep_data_uris=kwargs.get("keep_data_uris", False)
-            ).convert_soup(body_elm)
+            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(**kwargs).convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify(
-                keep_data_uris=kwargs.get("keep_data_uris", False)
-            ).convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)

         return DocumentConverterResult(
             markdown=webpage_text,

View file

@@ -86,7 +86,7 @@ class XlsxConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(html_content).markdown.strip()
+                self._html_converter.convert_string(html_content, **kwargs).markdown.strip()
                 + "\n\n"
             )
@@ -146,7 +146,7 @@ class XlsConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(html_content).markdown.strip()
+                self._html_converter.convert_string(html_content, **kwargs).markdown.strip()
                 + "\n\n"
             )

View file

@@ -10,6 +10,9 @@ class FileTestVector(object):
     url: str | None
     must_include: List[str]
     must_not_include: List[str]
+    # in test keep_data_uris cases, we want to ensure that the data URIs are kept
+    must_include_with_data_uris: List[str] = dataclasses.field(default_factory=list)
+    must_not_include_with_data_uris: List[str] = dataclasses.field(default_factory=list)


 GENERAL_TEST_VECTORS = [
@@ -25,8 +28,17 @@ GENERAL_TEST_VECTORS = [
             "# Abstract",
             "# Introduction",
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+            "data:image/png;base64...",
+        ],
+        must_not_include=[
+            "data:image/png;base64,iVBORw0KGgoAAAANSU",
+        ],
+        must_include_with_data_uris=[
+            "data:image/png;base64,iVBORw0KGgoAAAANSU",
+        ],
+        must_not_include_with_data_uris=[
+            "data:image/png;base64...",
         ],
-        must_not_include=[],
     ),
     FileTestVector(
         filename="test.xlsx",
@@ -65,8 +77,17 @@ GENERAL_TEST_VECTORS = [
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
             "2003",  # chart value
+            "![This phrase of the caption is Human-written.]",  # image caption
+        ],
+        must_not_include=[
+            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"
+        ],
+        must_include_with_data_uris=[
+            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
+        ],
+        must_not_include_with_data_uris=[
+            "![This phrase of the caption is Human-written.](Picture4.jpg)",
         ],
-        must_not_include=[],
     ),
     FileTestVector(
         filename="test_outlook_msg.msg",

View file

@@ -149,6 +149,50 @@ def test_convert_url(shared_tmp_dir, test_vector):
         assert test_string not in stdout


+@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
+def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
+    """Test CLI functionality when keep_data_uris is enabled"""
+    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
+    result = subprocess.run(
+        [
+            "python",
+            "-m",
+            "markitdown",
+            "--keep-data-uris",
+            "-o",
+            output_file,
+            os.path.join(TEST_FILES_DIR, test_vector.filename),
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    assert os.path.exists(output_file), f"Output file not created: {output_file}"
+
+    with open(output_file, "r") as f:
+        output_data = f.read()
+        for test_string in test_vector.must_include_with_data_uris:
+            assert test_string in output_data
+        for test_string in test_vector.must_not_include_with_data_uris:
+            assert test_string not in output_data
+
+        # Verify that the basic test conditions are still met
+        for string in test_vector.must_include:
+            if "data:image" in string:
+                # Skip data:image expectations: the default vectors assume
+                # truncated data URIs, but here they are kept in full
+                continue
+            assert string in output_data
+        for string in test_vector.must_not_include:
+            if "data:image" in string:
+                continue
+            assert string not in output_data
+
+    os.remove(output_file)
+    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
+
+
 if __name__ == "__main__":
     import sys
     import tempfile
@@ -161,6 +205,7 @@ if __name__ == "__main__":
         test_output_to_file,
         test_input_from_stdin_without_hints,
         test_convert_url,
+        test_output_to_file_with_data_uris,
     ]:
         for test_vector in CLI_TEST_VECTORS:
             print(
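Assuming a standard pytest setup for this repository, the new cases can be selected by keyword:

pytest -k "data_uris"

This picks up test_output_to_file_with_data_uris above plus the two module-level tests added below.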

Binary file not shown.

View file

@@ -124,6 +124,72 @@ def test_convert_url(test_vector):
         assert string not in result.markdown


+@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
+def test_convert_with_data_uris(test_vector):
+    """Test API functionality when keep_data_uris is enabled"""
+    markitdown = MarkItDown()
+
+    # Test local file conversion
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, test_vector.filename),
+        keep_data_uris=True,
+        url=test_vector.url,
+    )
+
+    # Verify keep_data_uris related test conditions
+    for string in test_vector.must_include_with_data_uris:
+        assert string in result.markdown
+    for string in test_vector.must_not_include_with_data_uris:
+        assert string not in result.markdown
+
+    # Verify that the basic test conditions are still met
+    for string in test_vector.must_include:
+        if "data:image" in string:
+            # Skip data:image expectations: the default vectors assume
+            # truncated data URIs, but here they are kept in full
+            continue
+        assert string in result.markdown
+    for string in test_vector.must_not_include:
+        if "data:image" in string:
+            continue
+        assert string not in result.markdown
+
+
+@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
+def test_convert_stream_with_data_uris(test_vector):
+    """Test stream conversion when keep_data_uris is enabled"""
+    markitdown = MarkItDown()
+
+    stream_info = StreamInfo(
+        extension=os.path.splitext(test_vector.filename)[1],
+        mimetype=test_vector.mimetype,
+        charset=test_vector.charset,
+    )
+
+    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
+        result = markitdown.convert(
+            stream,
+            stream_info=stream_info,
+            keep_data_uris=True,
+            url=test_vector.url,
+        )
+
+    # Verify keep_data_uris related test conditions
+    for string in test_vector.must_include_with_data_uris:
+        assert string in result.markdown
+    for string in test_vector.must_not_include_with_data_uris:
+        assert string not in result.markdown
+
+    # Verify that the basic test conditions are still met
+    for string in test_vector.must_include:
+        if "data:image" in string:
+            # Same skip as above: truncated-URI expectations do not apply here
+            continue
+        assert string in result.markdown
+    for string in test_vector.must_not_include:
+        if "data:image" in string:
+            continue
+        assert string not in result.markdown
+
+
 if __name__ == "__main__":
     import sys
@@ -134,6 +200,8 @@ if __name__ == "__main__":
         test_convert_stream_with_hints,
         test_convert_stream_without_hints,
         test_convert_url,
+        test_convert_with_data_uris,
+        test_convert_stream_with_data_uris,
     ]:
         for test_vector in GENERAL_TEST_VECTORS:
             print(