Merge branch 'main' into doc_intel_keycred

2025-03-26 09:31:36 -07:00 · 2025-03-26 09:31:36 -07:00 · 7df6d6cfc2
commit 7df6d6cfc2
parent c9f53ef63a 9a951055f0
15 changed files with 585 additions and 25 deletions
--- a/README.md
+++ b/README.md
@ -4,6 +4,9 @@
 ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
 [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
 > [!TIP]
 > MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.
 > [!IMPORTANT]
 > Breaking changes between 0.0.1 and 0.1.0:
 > * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior. 
--- a/packages/markitdown-mcp/Dockerfile
+++ b/packages/markitdown-mcp/Dockerfile
@ -0,0 +1,26 @@
 FROM python:3.13-slim-bullseye
 ENV DEBIAN_FRONTEND=noninteractive
 ENV EXIFTOOL_PATH=/usr/bin/exiftool
 ENV FFMPEG_PATH=/usr/bin/ffmpeg
 # Runtime dependencies.
 # The apt package lists are removed in the same RUN instruction that created
 # them: deleting them in a later instruction would not shrink the image,
 # because the files would already be stored in the earlier layer.
 RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    exiftool \
    && rm -rf /var/lib/apt/lists/*
 # Install the package from the build context
 COPY . /app
 RUN pip --no-cache-dir install /app
 # Directory that callers are expected to bind-mount their files into
 WORKDIR /workdir
 # Default USERID and GROUPID (unprivileged; override with --build-arg)
 ARG USERID=nobody
 ARG GROUPID=nogroup
 USER $USERID:$GROUPID
 ENTRYPOINT [ "markitdown-mcp" ]
--- a/packages/markitdown-mcp/README.md
+++ b/packages/markitdown-mcp/README.md
@ -0,0 +1,134 @@
 # MarkItDown-MCP
 [![PyPI](https://img.shields.io/pypi/v/markitdown-mcp.svg)](https://pypi.org/project/markitdown-mcp/)
 ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown-mcp)
 [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
 The `markitdown-mcp` package provides a lightweight STDIO and SSE MCP server for calling MarkItDown.
 It exposes one tool: `convert_to_markdown(uri)`, where uri can be any `http:`, `https:`, `file:`, or `data:` URI.
 ## Installation
 To install the package, use pip:
 ```bash
 pip install markitdown-mcp
 ```
 ## Usage
 To run the MCP server using STDIO (the default), use the following command:
 ```bash	
 markitdown-mcp
 ```
 To run the MCP server using SSE, use the following command:
 ```bash	
 markitdown-mcp --sse --host 127.0.0.1 --port 3001
 ```
 ## Running in Docker
 To run `markitdown-mcp` in Docker, build the Docker image using the provided Dockerfile:
 ```bash
 docker build -t markitdown-mcp:latest .
 ```
 And run it using:
 ```bash
 docker run -it --rm markitdown-mcp:latest
 ```
 This will be sufficient for remote URIs. To access local files, you need to mount the local directory into the container. For example, if you want to access files in `/home/user/data`, you can run:
 ```bash
 docker run -it --rm -v /home/user/data:/workdir markitdown-mcp:latest
 ```
 Once mounted, all files under data will be accessible under `/workdir` in the container. For example, if you have a file `example.txt` in `/home/user/data`, it will be accessible in the container at `/workdir/example.txt`.
 ## Accessing from Claude Desktop
 It is recommended to use the Docker image when running the MCP server for Claude Desktop.
 Follow [these instructions](https://modelcontextprotocol.io/quickstart/user#for-claude-desktop-users) to access Claude's `claude_desktop_config.json` file.
 Edit it to include the following JSON entry:
 ```json
 {
  "mcpServers": {
    "markitdown": {
      "command": "docker",
      "args": [
        "run",
        "--rm",
        "-i",
        "markitdown-mcp:latest"
      ]
    }
  }
 }
 ```
 If you want to mount a directory, adjust it accordingly:
 ```json
 {
  "mcpServers": {
    "markitdown": {
      "command": "docker",
      "args": [
 	"run",
 	"--rm",
 	"-i",
 	"-v",
 	"/home/user/data:/workdir",
 	"markitdown-mcp:latest"
      ]
    }
  }
 }
 ```
 ## Debugging
 To debug the MCP server you can use the `mcpinspector` tool.
 ```bash
 npx @modelcontextprotocol/inspector
 ```
 You can then connect to the inspector through the specified host and port (e.g., `http://localhost:5173/`).
 If using STDIO:
 * select `STDIO` as the transport type,
 * input `markitdown-mcp` as the command, and
 * click `Connect`
 If using SSE:
 * select `SSE` as the transport type,
 * input `http://127.0.0.1:3001/sse` as the URL, and
 * click `Connect`
 Finally:
 * click the `Tools` tab,
 * click `List Tools`,
 * click `convert_to_markdown`, and
 * run the tool on any valid URI.
 ## Security Considerations
 The server does not support authentication, and runs with the privileges of the user running it. For this reason, when running in SSE mode, it is recommended to run the server bound to `localhost` (default).
 ## Trademarks
 This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
 trademarks or logos is subject to and must follow
 [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
 Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
 Any use of third-party trademarks or logos are subject to those third-party's policies.
--- a/packages/markitdown-mcp/pyproject.toml
+++ b/packages/markitdown-mcp/pyproject.toml
@ -0,0 +1,69 @@
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 [project]
 name = "markitdown-mcp"
 dynamic = ["version"]
 description = 'An MCP server for the "markitdown" library.'
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
 keywords = []
 authors = [
  { name = "Adam Fourney", email = "adamfo@microsoft.com" },
 ]
 classifiers = [
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
  "mcp~=1.5.0",
  "markitdown[all]>=0.1.1,<0.2.0",
 ]
 [project.urls]
 Documentation = "https://github.com/microsoft/markitdown#readme"
 Issues = "https://github.com/microsoft/markitdown/issues"
 Source = "https://github.com/microsoft/markitdown"
 [tool.hatch.version]
 path = "src/markitdown_mcp/__about__.py"
 [project.scripts]
 markitdown-mcp = "markitdown_mcp.__main__:main"
 [tool.hatch.envs.types]
 extra-dependencies = [
  "mypy>=1.0.0",
 ]
 [tool.hatch.envs.types.scripts]
 check = "mypy --install-types --non-interactive {args:src/markitdown_mcp tests}"
 [tool.coverage.run]
 source_pkgs = ["markitdown-mcp", "tests"]
 branch = true
 parallel = true
 omit = [
  "src/markitdown_mcp/__about__.py",
 ]
 [tool.coverage.paths]
 markitdown-mcp = ["src/markitdown_mcp", "*/markitdown-mcp/src/markitdown_mcp"]
 tests = ["tests", "*/markitdown-mcp/tests"]
 [tool.coverage.report]
 exclude_lines = [
  "no cov",
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
 ]
 [tool.hatch.build.targets.sdist]
 only-include = ["src/markitdown_mcp"]
--- a/packages/markitdown-mcp/src/markitdown_mcp/about.py
+++ b/packages/markitdown-mcp/src/markitdown_mcp/about.py
@ -0,0 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
 __version__ = "0.0.1a3"
--- a/packages/markitdown-mcp/src/markitdown_mcp/init.py
+++ b/packages/markitdown-mcp/src/markitdown_mcp/init.py
@ -0,0 +1,9 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
 from .__about__ import __version__
 __all__ = [
    "__version__",
 ]
--- a/packages/markitdown-mcp/src/markitdown_mcp/main.py
+++ b/packages/markitdown-mcp/src/markitdown_mcp/main.py
@ -0,0 +1,83 @@
 import sys
 from typing import Any
 from mcp.server.fastmcp import FastMCP
 from starlette.applications import Starlette
 from mcp.server.sse import SseServerTransport
 from starlette.requests import Request
 from starlette.routing import Mount, Route
 from mcp.server import Server
 from markitdown import MarkItDown
 import uvicorn
 # Initialize FastMCP server for MarkItDown (SSE)
 mcp = FastMCP("markitdown")
# The single tool this server registers on the module-level `mcp` FastMCP instance.
# NOTE(review): the docstring below is presumably what FastMCP publishes as the
# tool description to clients -- confirm before rewording it.
@mcp.tool()
 async def convert_to_markdown(uri: str) -> str:
    """Convert a resource described by an http:, https:, file: or data: URI to markdown"""
    # A fresh MarkItDown instance is created on every call; only the `markdown`
    # attribute of the conversion result is returned to the caller.
    # NOTE(review): convert_uri() is called without await, so the conversion
    # (including any HTTP fetch) presumably blocks this coroutine -- confirm
    # whether that is acceptable for the SSE transport.
    return MarkItDown().convert_uri(uri).markdown
 def create_starlette_app(mcp_server: Server, *, debug: bool = False) -> Starlette:
    """Build the Starlette application that exposes ``mcp_server`` over SSE.

    The app has two routes: ``/sse``, which opens the event stream and runs
    the MCP server on it, and ``/messages/``, which is handed to the SSE
    transport's POST-message handler. ``debug`` is forwarded to Starlette.
    """
    transport = SseServerTransport("/messages/")
    async def handle_sse(request: Request) -> None:
        # The transport needs the raw ASGI callables, hence the private
        # ``request._send`` attribute.
        connection = transport.connect_sse(
            request.scope, request.receive, request._send
        )
        async with connection as streams:
            reader, writer = streams
            init_options = mcp_server.create_initialization_options()
            await mcp_server.run(reader, writer, init_options)
    routes = [
        Route("/sse", endpoint=handle_sse),
        Mount("/messages/", app=transport.handle_post_message),
    ]
    return Starlette(debug=debug, routes=routes)
 # Main entry point
 def main():
    """Parse the command line and run the MarkItDown MCP server.

    Without ``--sse`` the server runs through ``mcp.run()`` (STDIO per the
    ``--sse`` help text). With ``--sse`` a Starlette app is served by uvicorn
    on ``--host``/``--port``, defaulting to 127.0.0.1:3001.

    ``--host`` and ``--port`` are rejected unless ``--sse`` is given; in that
    case argparse prints the usage message and exits with status 2.
    """
    import argparse
    mcp_server = mcp._mcp_server
    parser = argparse.ArgumentParser(description="Run MCP SSE-based MarkItDown server")
    parser.add_argument(
        "--sse",
        action="store_true",
        help="Run the server with SSE transport rather than STDIO (default: False)",
    )
    # Host and port default to None (not their real defaults) so that an
    # explicitly supplied value can be told apart from an omitted one.
    parser.add_argument(
        "--host", default=None, help="Host to bind to (default: 127.0.0.1)"
    )
    parser.add_argument(
        "--port", type=int, default=None, help="Port to listen on (default: 3001)"
    )
    args = parser.parse_args()
    # The port is compared against None because 0 is a valid explicit value
    # that a truthiness test would silently treat as "not given".
    if not args.sse and (args.host or args.port is not None):
        # parser.error() prints usage and exits by itself (status 2), so no
        # explicit sys.exit() is needed after it.
        parser.error("Host and port arguments are only valid when using SSE transport.")
    if args.sse:
        # NOTE(review): debug=True is hard-coded here -- confirm that
        # Starlette debug output is acceptable for an unauthenticated server.
        starlette_app = create_starlette_app(mcp_server, debug=True)
        uvicorn.run(
            starlette_app,
            host=args.host if args.host else "127.0.0.1",
            port=args.port if args.port is not None else 3001,
        )
    else:
        mcp.run()
 if __name__ == "__main__":
    main()
--- a/packages/markitdown-mcp/src/markitdown_mcp/py.typed
+++ b/packages/markitdown-mcp/src/markitdown_mcp/py.typed
--- a/packages/markitdown-mcp/tests/init.py
+++ b/packages/markitdown-mcp/tests/init.py
@ -0,0 +1,3 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
--- a/packages/markitdown-sample-plugin/README.md
+++ b/packages/markitdown-sample-plugin/README.md
@ -1,7 +1,7 @@
 # MarkItDown Sample Plugin
-[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
+[![PyPI](https://img.shields.io/pypi/v/markitdown-sample-plugin.svg)](https://pypi.org/project/markitdown-sample-plugin/)
-![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
+![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown-sample-plugin)
 [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
--- a/packages/markitdown/src/markitdown/about.py
+++ b/packages/markitdown/src/markitdown/about.py
@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.0"
+__version__ = "0.1.1"
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -20,6 +20,7 @@ import charset_normalizer
 import codecs
 from ._stream_info import StreamInfo
 from ._uri_utils import parse_data_uri, file_uri_to_path
 from .converters import (
    PlainTextConverter,
@ -253,9 +254,10 @@ class MarkItDown:
        # Local path or url
        if isinstance(source, str):
            if (
-                source.startswith("http://")
+                source.startswith("http:")
-                or source.startswith("https://")
+                or source.startswith("https:")
-                or source.startswith("file://")
+                or source.startswith("file:")
                or source.startswith("data:")
            ):
                # Rename the url argument to mock_url
                # (Deprecated -- use stream_info)
@ -264,7 +266,7 @@ class MarkItDown:
                    _kwargs["mock_url"] = _kwargs["url"]
                    del _kwargs["url"]
-                return self.convert_url(source, stream_info=stream_info, **_kwargs)
+                return self.convert_uri(source, stream_info=stream_info, **_kwargs)
            else:
                return self.convert_local(source, stream_info=stream_info, **kwargs)
        # Path object
@ -374,22 +376,80 @@ class MarkItDown:
        url: str,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,
        mock_url: Optional[str] = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Alias for convert_uri()"""
        # convert_url will likely be deprecated in the future in favor of convert_uri
        return self.convert_uri(
            url,
            stream_info=stream_info,
            file_extension=file_extension,
            mock_url=mock_url,
            **kwargs,
        )
    def convert_uri(
        self,
        uri: str,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        mock_url: Optional[
            str
        ] = None,  # Mock the request as if it came from a different URL
        **kwargs: Any,
-    ) -> DocumentConverterResult:  # TODO: fix kwargs type
+    ) -> DocumentConverterResult:
-        # Send a HTTP request to the URL
+        uri = uri.strip()
-        response = self._requests_session.get(url, stream=True)
+
-        response.raise_for_status()
+        # File URIs
-        return self.convert_response(
+        if uri.startswith("file:"):
-            response,
+            netloc, path = file_uri_to_path(uri)
-            stream_info=stream_info,
+            if netloc and netloc != "localhost":
-            file_extension=file_extension,
+                raise ValueError(
-            url=mock_url,
+                    f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
-            **kwargs,
+                )
-        )
+            return self.convert_local(
                path,
                stream_info=stream_info,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        # Data URIs
        elif uri.startswith("data:"):
            mimetype, attributes, data = parse_data_uri(uri)
            base_guess = StreamInfo(
                mimetype=mimetype,
                charset=attributes.get("charset"),
            )
            if stream_info is not None:
                base_guess = base_guess.copy_and_update(stream_info)
            return self.convert_stream(
                io.BytesIO(data),
                stream_info=base_guess,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        # HTTP/HTTPS URIs
        elif uri.startswith("http:") or uri.startswith("https:"):
            response = self._requests_session.get(uri, stream=True)
            response.raise_for_status()
            return self.convert_response(
                response,
                stream_info=stream_info,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        else:
            raise ValueError(
                f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
            )
    def convert_response(
        self,
--- a/packages/markitdown/src/markitdown/_uri_utils.py
+++ b/packages/markitdown/src/markitdown/_uri_utils.py
@ -0,0 +1,52 @@
 import base64
 import os
 from typing import Tuple, Dict
 from urllib.request import url2pathname
 from urllib.parse import urlparse, unquote_to_bytes
 def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
    """Convert a file URI to a local file path"""
    parsed = urlparse(file_uri)
    if parsed.scheme != "file":
        raise ValueError(f"Not a file URL: {file_uri}")
    netloc = parsed.netloc if parsed.netloc else None
    path = os.path.abspath(url2pathname(parsed.path))
    return netloc, path
 def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
    if not uri.startswith("data:"):
        raise ValueError("Not a data URI")
    header, _, data = uri.partition(",")
    if not _:
        raise ValueError("Malformed data URI, missing ',' separator")
    meta = header[5:]  # Strip 'data:'
    parts = meta.split(";")
    is_base64 = False
    # Ends with base64?
    if parts[-1] == "base64":
        parts.pop()
        is_base64 = True
    mime_type = None  # Normally this would default to text/plain but we won't assume
    if len(parts) and len(parts[0]) > 0:
        # First part is the mime type
        mime_type = parts.pop(0)
    attributes: Dict[str, str] = {}
    for part in parts:
        # Handle key=value pairs in the middle
        if "=" in part:
            key, value = part.split("=", 1)
            attributes[key] = value
        elif len(part) > 0:
            attributes[part] = ""
    content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
    return mime_type, attributes, content
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@ -5,6 +5,8 @@ import shutil
 import openai
 import pytest
 from markitdown._uri_utils import parse_data_uri, file_uri_to_path
 from markitdown import (
    MarkItDown,
    UnsupportedFormatException,
@ -176,6 +178,79 @@ def test_stream_info_operations() -> None:
    assert updated_stream_info.url == "url.1"
 def test_data_uris() -> None:
    """Check parse_data_uri on base64 and percent-encoded data URIs."""
    # Each case: (data URI, expected mime type, expected attributes).
    # Every URI encodes the same payload, b"Hello, World!".
    cases = [
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", "text/plain", {}),
        ("data:base64,SGVsbG8sIFdvcmxkIQ==", None, {}),
        (
            "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==",
            "text/plain",
            {"charset": "utf-8"},
        ),
        ("data:,Hello%2C%20World%21", None, {}),
        ("data:text/plain,Hello%2C%20World%21", "text/plain", {}),
        (
            "data:text/plain;charset=utf-8,Hello%2C%20World%21",
            "text/plain",
            {"charset": "utf-8"},
        ),
    ]
    for data_uri, expected_mime, expected_attributes in cases:
        mime_type, attributes, data = parse_data_uri(data_uri)
        assert mime_type == expected_mime
        assert attributes == expected_attributes
        assert data == b"Hello, World!"
 def test_file_uris() -> None:
    """Check file_uri_to_path on the supported file URI forms."""
    # Each case: (file URI, expected netloc). All URIs name the same path.
    cases = [
        ("file:///path/to/file.txt", None),  # empty host
        ("file:/path/to/file.txt", None),  # no host
        ("file://localhost/path/to/file.txt", "localhost"),  # localhost
        ("file:///path/to/file.txt?param=value", None),  # query parameters
        ("file:///path/to/file.txt#fragment", None),  # fragment
    ]
    for file_uri, expected_netloc in cases:
        netloc, path = file_uri_to_path(file_uri)
        assert netloc == expected_netloc
        assert path == "/path/to/file.txt"
 def test_docx_comments() -> None:
    markitdown = MarkItDown()
@ -314,6 +389,8 @@ if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    for test in [
        test_stream_info_operations,
        test_data_uris,
        test_file_uris,
        test_docx_comments,
        test_input_as_strings,
        test_markitdown_remote,
--- a/packages/markitdown/tests/test_module_vectors.py
+++ b/packages/markitdown/tests/test_module_vectors.py
@ -3,7 +3,9 @@ import os
 import time
 import pytest
 import codecs
 import base64
 from pathlib import Path
 if __name__ == "__main__":
    from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
@ -108,8 +110,8 @@ def test_convert_stream_without_hints(test_vector):
    reason="do not run tests that query external urls",
 )
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
-def test_convert_url(test_vector):
+def test_convert_http_uri(test_vector):
-    """Test the conversion of a stream with no stream info."""
+    """Test the conversion of an HTTP:// or HTTPS:// URI."""
    markitdown = MarkItDown()
    time.sleep(1)  # Ensure we don't hit rate limits
@ -124,8 +126,44 @@ def test_convert_url(test_vector):
        assert string not in result.markdown
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
 def test_convert_file_uri(test_vector):
    """Test the conversion of a file:// URI."""
    markitdown = MarkItDown()
    result = markitdown.convert(
        Path(os.path.join(TEST_FILES_DIR, test_vector.filename)).as_uri(),
        url=test_vector.url,
    )
    for string in test_vector.must_include:
        assert string in result.markdown
    for string in test_vector.must_not_include:
        assert string not in result.markdown
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
 def test_convert_data_uri(test_vector):
    """Test the conversion of a data URI."""
    markitdown = MarkItDown()
    data = ""
    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
        data = base64.b64encode(stream.read()).decode("utf-8")
    mimetype = test_vector.mimetype
    data_uri = f"data:{mimetype};base64,{data}"
    result = markitdown.convert(
        data_uri,
        url=test_vector.url,
    )
    for string in test_vector.must_include:
        assert string in result.markdown
    for string in test_vector.must_not_include:
        assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
-def test_convert_with_data_uris(test_vector):
+def test_convert_keep_data_uris(test_vector):
    """Test API functionality when keep_data_uris is enabled"""
    markitdown = MarkItDown()
@ -143,7 +181,7 @@ def test_convert_with_data_uris(test_vector):
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
-def test_convert_stream_with_data_uris(test_vector):
+def test_convert_stream_keep_data_uris(test_vector):
    """Test the conversion of a stream with no stream info."""
    markitdown = MarkItDown()
@ -175,7 +213,9 @@ if __name__ == "__main__":
        test_convert_local,
        test_convert_stream_with_hints,
        test_convert_stream_without_hints,
-        test_convert_url,
+        test_convert_http_uri,
        test_convert_file_uri,
        test_convert_data_uri,
    ]:
        for test_vector in GENERAL_TEST_VECTORS:
            print(
@ -186,8 +226,8 @@ if __name__ == "__main__":
    # Data URI tests
    for test_function in [
-        test_convert_with_data_uris,
+        test_convert_keep_data_uris,
-        test_convert_stream_with_data_uris,
+        test_convert_stream_keep_data_uris,
    ]:
        for test_vector in DATA_URI_TEST_VECTORS:
            print(