From da1007085c82c187fb57dbb0073776690757fe13 Mon Sep 17 00:00:00 2001
From: Brian Yang
Date: Mon, 6 Jan 2025 00:45:58 -0500
Subject: [PATCH] Add API endpoints for file conversion

---
 Dockerfile            |  4 ++--
 README.md             | 23 +++++++++++++++++++++++
 src/markitdown/api.py | 36 ++++++++++++++++++++++++++++++++++++
 tests/test_api.py     | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 src/markitdown/api.py
 create mode 100644 tests/test_api.py

Reviewer note: the blob ids on the "index" lines below predate the review
changes (python-multipart, dotted file extension, CWD-independent test paths)
and were not recomputed.

diff --git a/Dockerfile b/Dockerfile
index 0072d9e..3ca2fc3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
     && rm -rf /var/lib/apt/lists/*
 
-RUN pip install markitdown
+RUN pip install markitdown fastapi uvicorn python-multipart
 
 # Default USERID and GROUPID
 ARG USERID=10000
@@ -20,4 +20,4 @@ ARG GROUPID=10000
 
 USER $USERID:$GROUPID
 
-ENTRYPOINT [ "markitdown" ]
+ENTRYPOINT ["uvicorn", "markitdown.api:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
index d2314c3..4b94e29 100644
--- a/README.md
+++ b/README.md
@@ -66,6 +66,29 @@ result = md.convert("example.jpg")
 print(result.text_content)
 ```
 
+### RESTful API
+
+MarkItDown also provides a RESTful API using FastAPI. You can deploy the application on Vercel or another hosting service.
+
+#### Endpoints
+
+- `GET /`: Returns a welcome message.
+- `POST /convert`: Converts an uploaded file to Markdown.
+
+#### Example Usage
+
+Using `curl`:
+
+```sh
+curl -X POST "http://<your-deployment-url>/convert" -F "file=@path-to-file.pdf"
+```
+
+Using `httpie`:
+
+```sh
+http --form POST "http://<your-deployment-url>/convert" file@path-to-file.pdf
+```
+
 ### Docker
 
 ```sh
diff --git a/src/markitdown/api.py b/src/markitdown/api.py
new file mode 100644
index 0000000..daf0019
--- /dev/null
+++ b/src/markitdown/api.py
@@ -0,0 +1,36 @@
+"""FastAPI application exposing MarkItDown file conversion over HTTP."""
+
+import os
+
+from fastapi import FastAPI, HTTPException, UploadFile, File
+from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException
+
+app = FastAPI()
+
+@app.get("/")
+async def root():
+    """Return a static welcome message."""
+    return {"message": "Welcome to the MarkItDown API"}
+
+@app.post("/convert")
+async def convert(file: UploadFile = File(...)):
+    """Convert an uploaded file to Markdown.
+
+    Responds with the result's ``title`` and ``text_content``. Raises
+    HTTPException 400 for unsupported formats and 500 for conversion or
+    any other errors.
+    """
+    # Guard against a missing filename; splitext keeps the leading dot
+    # (".docx") and yields "" for extension-less names, not the whole name.
+    # NOTE(review): assumes converters match dotted extensions - confirm.
+    extension = os.path.splitext(file.filename or "")[1]
+    try:
+        markitdown = MarkItDown()
+        result = markitdown.convert_stream(file.file, file_extension=extension or None)
+        return {"title": result.title, "text_content": result.text_content}
+    except UnsupportedFormatException as exc:
+        raise HTTPException(status_code=400, detail="Unsupported file format") from exc
+    except FileConversionException as exc:
+        raise HTTPException(status_code=500, detail="File conversion error") from exc
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 0000000..560b370
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,37 @@
+"""Tests for the MarkItDown FastAPI endpoints."""
+
+import os
+
+import pytest
+from fastapi.testclient import TestClient
+from markitdown.api import app
+
+# Resolve fixtures relative to this file so the tests do not depend on the CWD.
+# NOTE(review): test.unsupported and test_corrupted.docx are not added by this
+# patch - confirm they already exist in tests/test_files.
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+client = TestClient(app)
+
+def test_root():
+    response = client.get("/")
+    assert response.status_code == 200
+    assert response.json() == {"message": "Welcome to the MarkItDown API"}
+
+def test_convert_success():
+    with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as file:
+        response = client.post("/convert", files={"file": file})
+        assert response.status_code == 200
+        assert "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation" in response.json()["text_content"]
+
+def test_convert_unsupported_format():
+    with open(os.path.join(TEST_FILES_DIR, "test.unsupported"), "rb") as file:
+        response = client.post("/convert", files={"file": file})
+        assert response.status_code == 400
+        assert response.json() == {"detail": "Unsupported file format"}
+
+def test_convert_conversion_error():
+    with open(os.path.join(TEST_FILES_DIR, "test_corrupted.docx"), "rb") as file:
+        response = client.post("/convert", files={"file": file})
+        assert response.status_code == 500
+        assert response.json() == {"detail": "File conversion error"}