This commit is contained in:
Raduan A. 2025-02-09 10:47:40 -06:00 committed by GitHub
commit bc5a57ec6e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 84 additions and 0 deletions

View file

@ -46,6 +46,13 @@ dependencies = [
"azure-identity"
]
# Development-only tooling (test runner, asyncio test plugin, formatter);
# listed separately from the runtime `dependencies` array above.
[tool.uv]
dev-dependencies = [
"pytest>=7.0",
"pytest-asyncio>=0.23.0",
"black>=23.7.0",
]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"
Issues = "https://github.com/microsoft/markitdown/issues"

View file

@ -3,9 +3,11 @@
# SPDX-License-Identifier: MIT
from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
from ._async_wrapper import AsyncMarkItDown
# Names re-exported as the package's public API: the synchronous converter,
# its async wrapper, and the two exception types imported above.
__all__ = [
"MarkItDown",
"AsyncMarkItDown",
"FileConversionException",
"UnsupportedFormatException",
]

View file

@ -0,0 +1,46 @@
"""Async wrapper for MarkItDown."""
import asyncio
from functools import partial
from typing import Optional, Union
from ._markitdown import MarkItDown, DocumentConverterResult
class AsyncMarkItDown:
    """Async wrapper for MarkItDown that runs operations in an executor.

    Each conversion is submitted to the default executor of whichever event
    loop is running at call time, so the blocking conversion does not stall
    other coroutines. No event loop is captured at construction time, which
    means an instance may be created outside of any loop (for example at
    module import) and reused across separate ``asyncio.run()`` invocations.
    """

    def __init__(self, markitdown: Optional["MarkItDown"] = None):
        """Initialize the async wrapper.

        Args:
            markitdown: Optional MarkItDown instance to wrap. If not provided,
                a new instance will be created.
        """
        # Compare against None explicitly so a caller-supplied instance is
        # never swapped out merely because it evaluates as falsy.
        self._markitdown = markitdown if markitdown is not None else MarkItDown()

    async def __aenter__(self):
        """Async context manager entry; returns the wrapper itself."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit.

        Nothing is released here, and returning None lets any exception raised
        inside the ``async with`` body propagate unchanged.
        """
        return None

    async def convert(self, file_path: str, **kwargs) -> "DocumentConverterResult":
        """Convert a file to markdown asynchronously.

        This runs the synchronous convert operation in the running loop's
        default executor to avoid blocking the event loop.

        Args:
            file_path: Path to the file to convert
            **kwargs: Additional arguments to pass to the converter

        Returns:
            DocumentConverterResult containing the converted markdown

        Raises:
            Whatever the wrapped ``convert`` raises; the exception is re-raised
            in the awaiting coroutine.
        """
        # Look the loop up at call time instead of caching it in __init__:
        # a loop captured at construction may not be the one that is running
        # now, and awaiting a future bound to another loop raises RuntimeError.
        loop = asyncio.get_running_loop()
        func = partial(self._markitdown.convert, file_path, **kwargs)
        return await loop.run_in_executor(None, func)

View file

@ -0,0 +1,29 @@
#!/usr/bin/env python3 -m pytest
import os
import pytest
from markitdown import AsyncMarkItDown
# Sample documents live in a "test_files" directory next to this module.
_THIS_DIR = os.path.dirname(__file__)
TEST_FILES_DIR = os.path.join(_THIS_DIR, "test_files")

# Substrings the test below expects to find in the markdown produced from
# test.docx (after backslashes have been stripped from the output).
DOCX_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
]
@pytest.mark.asyncio
async def test_async_markitdown_basic():
    """Test basic async functionality with a local file.

    Converts the sample test.docx through the async wrapper and checks that
    every expected marker string appears in the resulting text.
    """
    docx_path = os.path.join(TEST_FILES_DIR, "test.docx")
    async with AsyncMarkItDown() as markitdown:
        result = await markitdown.convert(docx_path)

        # Strip backslashes once, before the loop, rather than on every
        # iteration (presumably they are Markdown escapes in the output --
        # confirm against the converter).
        text_content = result.text_content.replace("\\", "")

        # Verify the conversion worked as expected
        for test_string in DOCX_TEST_STRINGS:
            assert test_string in text_content
if __name__ == "__main__":
    # pytest.main() returns its exit status rather than exiting the process;
    # propagate it so running this file directly reports failures to the shell.
    raise SystemExit(pytest.main([__file__]))