This commit is contained in:
Vijay Soni 2025-01-06 21:44:53 +01:00 committed by GitHub
commit c47f856250
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 81 additions and 3 deletions

View file

@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \ ffmpeg \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN pip install markitdown RUN pip install markitdown fastapi uvicorn
# Default USERID and GROUPID # Default USERID and GROUPID
ARG USERID=10000 ARG USERID=10000
@ -20,4 +20,4 @@ ARG GROUPID=10000
USER $USERID:$GROUPID USER $USERID:$GROUPID
ENTRYPOINT [ "markitdown" ] ENTRYPOINT ["uvicorn", "src.markitdown.api:app", "--host", "0.0.0.0", "--port", "8000"]

View file

@ -69,6 +69,42 @@ print(result.text_content)
docker build -t markitdown:latest . docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
``` ```
### Web API
You can also use MarkItDown via a REST endpoint. The Web API is built using FastAPI and can be run using Docker.
#### Running the Web API
1. Build the Docker image:
```sh
docker build -t markitdown-api:latest .
```
2. Run the Docker container:
```sh
docker run --rm -p 8000:8000 markitdown-api:latest
```
The Web API will be available at `http://localhost:8000`.
#### Using the Web API
The Web API exposes a single endpoint, `/convert`, which accepts an uploaded file and returns its contents converted to Markdown.
- **Endpoint:** `/convert`
- **Method:** `POST`
- **Request Body:** Multipart form data with a file field named `file`
- **Response:** JSON object with a `markdown` field containing the converted markdown
Example using `curl`:
```sh
curl -X POST "http://localhost:8000/convert" -F "file=@path-to-file.pdf"
```
<details> <details>
<summary>Batch Processing Multiple Files</summary> <summary>Batch Processing Multiple Files</summary>

View file

@ -42,6 +42,8 @@ dependencies = [
"pathvalidate", "pathvalidate",
"charset-normalizer", "charset-normalizer",
"openai", "openai",
"fastapi",
"uvicorn",
] ]
[project.urls] [project.urls]

View file

@ -6,6 +6,7 @@ import sys
from textwrap import dedent from textwrap import dedent
from .__about__ import __version__ from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult from ._markitdown import MarkItDown, DocumentConverterResult
import uvicorn
def main(): def main():
@ -57,9 +58,16 @@ def main():
"--output", "--output",
help="Output file name. If not provided, output is written to stdout.", help="Output file name. If not provided, output is written to stdout.",
) )
parser.add_argument(
"--api",
action="api",
help="Start the API server",
)
args = parser.parse_args() args = parser.parse_args()
if args.filename is None: if args.api:
uvicorn.run("src.markitdown.api:app", host="0.0.0.0", port=8000)
elif args.filename is None:
markitdown = MarkItDown() markitdown = MarkItDown()
result = markitdown.convert_stream(sys.stdin.buffer) result = markitdown.convert_stream(sys.stdin.buffer)
_handle_output(args, result) _handle_output(args, result)

32
src/markitdown/api.py Normal file
View file

@ -0,0 +1,32 @@
import os
import tempfile

from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import FileResponse
from markitdown import MarkItDown
app = FastAPI()
@app.post("/convert")
async def convert(file: UploadFile = File(...)):
    """Convert an uploaded file to Markdown.

    The upload is spooled to a uniquely named temporary file, handed to
    ``MarkItDown.convert`` by path, and the temporary file is removed again
    whether or not the conversion succeeds.

    Args:
        file: The multipart upload sent in the ``file`` form field.

    Returns:
        A dict with a single ``markdown`` key holding the converted text.

    Raises:
        HTTPException: 400 when the upload carries no filename; 500 when
            reading, writing, or converting the file fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file uploaded")

    # The filename is client-controlled. Never use it as a path component:
    # keep only its extension so a name such as "../../etc/x" cannot escape
    # the temp directory and concurrent uploads with equal names cannot
    # overwrite each other.
    # NOTE(review): the extension is preserved because the converter is
    # presumably selected from it -- confirm against MarkItDown.convert.
    suffix = os.path.splitext(os.path.basename(file.filename))[1]

    temp_file_path = None
    try:
        contents = await file.read()
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
            # Record the path before writing so a failed write is still
            # cleaned up in the ``finally`` clause.
            temp_file_path = temp_file.name
            temp_file.write(contents)

        markitdown = MarkItDown()
        result = markitdown.convert(temp_file_path)
        return {"markdown": result.text_content}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if temp_file_path is not None:
            try:
                os.remove(temp_file_path)
            except OSError:
                # Best-effort cleanup; do not mask the real outcome.
                pass