diff --git a/pyproject.toml b/pyproject.toml
index 2a4e203..61965a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,13 @@ dependencies = [
   "azure-identity"
 ]
 
+[tool.uv]
+dev-dependencies = [
+  "pytest>=7.0",
+  "pytest-asyncio>=0.23.0",
+  "black>=23.7.0",
+]
+
 [project.urls]
 Documentation = "https://github.com/microsoft/markitdown#readme"
 Issues = "https://github.com/microsoft/markitdown/issues"
diff --git a/src/markitdown/__init__.py b/src/markitdown/__init__.py
index 482f428..94b0c0a 100644
--- a/src/markitdown/__init__.py
+++ b/src/markitdown/__init__.py
@@ -3,9 +3,11 @@
 # SPDX-License-Identifier: MIT
 
 from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
+from ._async_wrapper import AsyncMarkItDown
 
 __all__ = [
     "MarkItDown",
+    "AsyncMarkItDown",
     "FileConversionException",
     "UnsupportedFormatException",
 ]
diff --git a/src/markitdown/_async_wrapper.py b/src/markitdown/_async_wrapper.py
new file mode 100644
index 0000000..5efc9ba
--- /dev/null
+++ b/src/markitdown/_async_wrapper.py
@@ -0,0 +1,48 @@
+"""Async wrapper for MarkItDown."""
+
+import asyncio
+from functools import partial
+from typing import Optional, Union
+
+from ._markitdown import MarkItDown, DocumentConverterResult
+
+
+class AsyncMarkItDown:
+    """Async wrapper for MarkItDown that runs operations in a thread pool."""
+
+    def __init__(self, markitdown: Optional[MarkItDown] = None):
+        """Initialize the async wrapper.
+
+        Args:
+            markitdown: Optional MarkItDown instance to wrap. If not provided,
+                a new instance will be created.
+        """
+        self._markitdown = markitdown if markitdown is not None else MarkItDown()
+
+    async def __aenter__(self):
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        pass
+
+    async def convert(self, file_path: str, **kwargs) -> DocumentConverterResult:
+        """Convert a file to markdown asynchronously.
+
+        This runs the synchronous convert operation in a thread pool to avoid
+        blocking the event loop.
+
+        Args:
+            file_path: Path to the file to convert
+            **kwargs: Additional arguments to pass to the converter
+
+        Returns:
+            DocumentConverterResult containing the converted markdown
+        """
+        # Look up the loop at call time rather than caching one in __init__:
+        # the wrapper may be constructed before any loop is running, or be
+        # reused across several asyncio.run() calls.
+        loop = asyncio.get_running_loop()
+        func = partial(self._markitdown.convert, file_path, **kwargs)
+        return await loop.run_in_executor(None, func)
diff --git a/tests/test_async_markitdown.py b/tests/test_async_markitdown.py
new file mode 100644
index 0000000..7d804ec
--- /dev/null
+++ b/tests/test_async_markitdown.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3 -m pytest
+import asyncio
+import os
+import pytest
+
+from markitdown import AsyncMarkItDown
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+DOCX_TEST_STRINGS = [
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+]
+
+
+@pytest.mark.asyncio
+async def test_async_markitdown_basic():
+    """Test basic async functionality with a local file."""
+    async with AsyncMarkItDown() as markitdown:
+        result = await markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
+
+        # Strip backslashes once; the text is identical for every check
+        text_content = result.text_content.replace("\\", "")
+        for test_string in DOCX_TEST_STRINGS:
+            assert test_string in text_content
+
+
+def test_async_markitdown_created_outside_loop():
+    """Test a wrapper constructed before any event loop is running."""
+    markitdown = AsyncMarkItDown()
+
+    async def _convert():
+        return await markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
+
+    result = asyncio.run(_convert())
+    assert DOCX_TEST_STRINGS[0] in result.text_content.replace("\\", "")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])