This commit is contained in:
Hemanth HM 2025-01-03 23:27:05 +00:00 committed by GitHub
commit 7548720917
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 76 additions and 0 deletions

View file

@ -72,6 +72,23 @@ print(result.text_content)
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```
### Serve Command
You can start a CORS-enabled Flask server to convert files to markdown using the `serve` command:
```sh
serve
```
The server will be available at `http://localhost:5000`. Send a POST request to the `/convert` endpoint with either a `file` upload or a `url` form field; the response is JSON with the converted Markdown under the `content` key.
Example using `curl`:
```sh
curl -X POST -F 'file=@path-to-file.pdf' http://localhost:5000/convert
```
<details>
<summary>Batch Processing Multiple Files</summary>

View file

@ -42,6 +42,8 @@ dependencies = [
"pathvalidate",
"charset-normalizer",
"openai",
"flask",
"flask-cors",
]
[project.urls]
@ -54,6 +56,7 @@ path = "src/markitdown/__about__.py"
[project.scripts]
markitdown = "markitdown.__main__:main"
serve = "markitdown.server:app.run"
[tool.hatch.envs.types]
extra-dependencies = [

24
src/markitdown/server.py Normal file
View file

@ -0,0 +1,24 @@
import os

from flask import Flask, request, jsonify
from flask_cors import CORS

from markitdown import MarkItDown
# Flask application object; the `serve` console script and the tests import it by this name.
app = Flask(__name__)
# Enable CORS on the app with flask-cors' default settings (no options are passed).
CORS(app)
# Single MarkItDown converter instance created at import time and reused by the /convert handler.
markitdown = MarkItDown()
@app.route('/convert', methods=['POST'])
def convert():
    """Convert an uploaded file or a remote URL to Markdown.

    Reads either a multipart ``file`` upload or a ``url`` form field from the
    POST request. The upload takes precedence when both are present.

    Returns:
        A JSON response ``{"content": <markdown text>}`` on success, or
        ``{"error": <message>}`` with HTTP status 400 when the request has
        no file and no usable URL.
    """
    if 'file' in request.files:
        upload = request.files['file']
        # MarkItDown extension hints carry the leading dot (".pdf").
        # splitext keeps the dot and yields "" for a name without an
        # extension, where split('.')[-1] returned the whole filename.
        extension = os.path.splitext(upload.filename or '')[1]
        # convert_stream() is the entry point for file-like objects.
        result = markitdown.convert_stream(
            upload.stream, file_extension=extension or None
        )
        return jsonify({'content': result.text_content})

    if 'url' in request.form:
        url = request.form['url'].strip()
        # MarkItDown.convert() also accepts local file paths, so an
        # unchecked value would let any client read files on the server.
        # Only remote http(s) sources are allowed.
        if not url.lower().startswith(('http://', 'https://')):
            return jsonify({'error': 'Only http(s) URLs are supported'}), 400
        # NOTE(review): the server still fetches arbitrary remote URLs on the
        # client's behalf; consider a host allow-list if this is exposed
        # beyond a trusted network.
        result = markitdown.convert(url)
        return jsonify({'content': result.text_content})

    return jsonify({'error': 'No file or URL provided'}), 400
if __name__ == "__main__":
    # Run directly as a script: listen on every network interface, port 5000.
    app.run(port=5000, host="0.0.0.0")

View file

@ -145,6 +145,11 @@ LLM_TEST_STRINGS = [
"5bda1dd6",
]
# Substrings expected in the Markdown returned by the serve (/convert) tests.
SERVE_TEST_STRINGS = [
    "While there is contemporaneous exploration of multi-agent approaches",
]
# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
@ -330,6 +335,32 @@ def test_markitdown_llm() -> None:
assert test_string in result.text_content.lower()
# Test for the Flask conversion server (the `serve` command)
def test_markitdown_serve() -> None:
    """Exercise the /convert endpoint with a file upload and with a URL."""
    # Import through the package path used by the `serve` entry point
    # ("markitdown.server"), not the repository-relative "src." path.
    from markitdown.server import app

    client = app.test_client()

    # Test with file: upload a plain-text document whose content is known.
    # Dummy bytes named "test.pdf" are not a valid PDF and could never
    # produce the expected strings.
    upload_text = "test content"
    response = client.post(
        "/convert",
        data={"file": (io.BytesIO(upload_text.encode("utf-8")), "test.txt")},
        content_type="multipart/form-data",
    )
    assert response.status_code == 200
    assert upload_text in response.json["content"]

    # Test with URL (presumably needs network access to fetch PDF_TEST_URL)
    response = client.post(
        "/convert",
        data={"url": PDF_TEST_URL},
    )
    assert response.status_code == 200
    for test_string in SERVE_TEST_STRINGS:
        assert test_string in response.json["content"]
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_markitdown_remote()
@ -337,3 +368,4 @@ if __name__ == "__main__":
test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()
test_markitdown_serve()