Add Ollama integration for image descriptions

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown?shareId=XXXX-XXXX-XXXX-XXXX).
This commit is contained in:
Tom 2025-01-03 13:48:19 -07:00
parent 125e206047
commit e2470fc413
10 changed files with 232 additions and 20 deletions

View file

@ -1,32 +1,21 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{ {
"name": "Existing Dockerfile", "name": "Existing Dockerfile",
"build": { "build": {
// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..", "context": "..",
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerfile": "../Dockerfile", "dockerfile": "../Dockerfile",
"args": { "args": {
"INSTALL_GIT": "true" "INSTALL_GIT": "true"
} }
}, },
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
"features": { "features": {
"ghcr.io/devcontainers-extra/features/hatch:2": {} "ghcr.io/devcontainers-extra/features/hatch:2": {},
"ghcr.io/devcontainers/features/python:1": {
"version": "3.10"
},
"ghcr.io/devcontainers/features/node:1": {
"version": "16"
},
"ghcr.io/devcontainers/features/ollama:1": {}
}, },
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Uncomment the next line to run commands after the container is created.
// "postCreateCommand": "cat /etc/os-release",
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
"remoteUser": "root" "remoteUser": "root"
} }

View file

@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \ ffmpeg \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN pip install markitdown RUN pip install markitdown ollama
# Default USERID and GROUPID # Default USERID and GROUPID
ARG USERID=10000 ARG USERID=10000

View file

@ -66,6 +66,18 @@ result = md.convert("example.jpg")
print(result.text_content) print(result.text_content)
``` ```
To use Ollama for image descriptions, provide `ollama_client`:
```python
from markitdown import MarkItDown
from ollama import Ollama
client = Ollama(api_key="your-api-key")
md = MarkItDown(ollama_client=client)
result = md.convert("example.jpg")
print(result.text_content)
```
### Docker ### Docker
```sh ```sh

View file

@ -1076,6 +1076,54 @@ class ImageConverter(MediaConverter):
return response.choices[0].message.content return response.choices[0].message.content
class OllamaConverter(DocumentConverter):
    """
    Converts images to markdown via description using Ollama API.

    Only `.jpg`, `.jpeg` and `.png` inputs are considered, and only when an
    `ollama_client` keyword argument is supplied. In every other case
    `convert` returns None rather than an empty result.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Describe the image at `local_path` using the client in `kwargs`.

        Keyword arguments read here:
            file_extension: extension of the file, including the leading dot.
            ollama_client: client object used to describe the image.
            ollama_prompt: optional prompt overriding the default caption prompt.

        Returns a DocumentConverterResult whose text is a "# Description:"
        section, or None when the file is not a supported image or no
        client was provided.
        """
        # Bail if not an image (tolerate a missing or None extension)
        extension = kwargs.get("file_extension") or ""
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None

        # Bail if there is no client: without one this converter has nothing
        # to contribute, and an empty result would claim the file as handled.
        ollama_client = kwargs.get("ollama_client")
        if ollama_client is None:
            return None

        description = self._get_ollama_description(
            local_path,
            extension,
            ollama_client,
            prompt=kwargs.get("ollama_prompt"),
        )
        md_content = "\n# Description:\n" + description.strip() + "\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )

    def _get_ollama_description(self, local_path, extension, client, prompt=None):
        """
        Send the image to `client` as a base64 data URI and return its description.

        A blank or missing `prompt` falls back to a generic caption request.
        """
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        # Derive the MIME type from the extension alone; default to JPEG
        content_type, _ = mimetypes.guess_type("_dummy" + extension)
        if content_type is None:
            content_type = "image/jpeg"

        with open(local_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
        data_uri = f"data:{content_type};base64,{image_base64}"

        # NOTE(review): the official `ollama` Python client appears to expose
        # chat()/generate() rather than describe_image() -- confirm which
        # client type is expected here before relying on this call.
        response = client.describe_image(data_uri, prompt)
        return response["description"]
class ZipConverter(DocumentConverter): class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files. """Converts ZIP files to markdown by extracting and converting all contained files.
@ -1223,6 +1271,7 @@ class MarkItDown:
llm_client: Optional[Any] = None, llm_client: Optional[Any] = None,
llm_model: Optional[str] = None, llm_model: Optional[str] = None,
style_map: Optional[str] = None, style_map: Optional[str] = None,
ollama_client: Optional[Any] = None,
# Deprecated # Deprecated
mlm_client: Optional[Any] = None, mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None, mlm_model: Optional[str] = None,
@ -1264,6 +1313,7 @@ class MarkItDown:
self._llm_client = llm_client self._llm_client = llm_client
self._llm_model = llm_model self._llm_model = llm_model
self._style_map = style_map self._style_map = style_map
self._ollama_client = ollama_client
self._page_converters: List[DocumentConverter] = [] self._page_converters: List[DocumentConverter] = []
@ -1285,6 +1335,7 @@ class MarkItDown:
self.register_page_converter(IpynbConverter()) self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter()) self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter()) self.register_page_converter(ZipConverter())
self.register_page_converter(OllamaConverter())
def convert( def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any self, source: Union[str, requests.Response, Path], **kwargs: Any
@ -1445,6 +1496,9 @@ class MarkItDown:
if "llm_model" not in _kwargs and self._llm_model is not None: if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model _kwargs["llm_model"] = self._llm_model
if "ollama_client" not in _kwargs and self._ollama_client is not None:
_kwargs["ollama_client"] = self._ollama_client
# Add the list of converters for nested processing # Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters _kwargs["_parent_converters"] = self._page_converters

View file

@ -25,6 +25,13 @@ except ModuleNotFoundError:
# Skip exiftool tests if not installed # Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None skip_exiftool = shutil.which("exiftool") is None
# Skip Ollama tests unless the `ollama` package is importable and an API key
# is present in the environment
try:
    import ollama
except ModuleNotFoundError:
    skip_ollama = True
else:
    skip_ollama = not os.environ.get("OLLAMA_API_KEY")
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
JPG_TEST_EXIFTOOL = { JPG_TEST_EXIFTOOL = {
@ -130,6 +137,11 @@ LLM_TEST_STRINGS = [
"5bda1dd6", "5bda1dd6",
] ]
OLLAMA_TEST_STRINGS = [
"detailed caption",
"image",
]
# --- Helper Functions --- # --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None): def validate_strings(result, expected_strings, exclude_strings=None):
@ -300,6 +312,20 @@ def test_markitdown_llm() -> None:
assert test_string in result.text_content.lower() assert test_string in result.text_content.lower()
@pytest.mark.skipif(
    skip_ollama,
    reason="do not run ollama tests without a key",
)
def test_markitdown_ollama() -> None:
    """Convert a sample image through an Ollama client and check the description text."""
    client = ollama.Ollama(api_key=os.environ.get("OLLAMA_API_KEY"))
    markitdown = MarkItDown(ollama_client=client)

    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_ollama.jpg"))

    # Compare case-insensitively, as test_markitdown_llm does: the expected
    # strings are lowercase and generated text may capitalise them.
    text_content = result.text_content.lower()
    for test_string in OLLAMA_TEST_STRINGS:
        assert test_string in text_content
if __name__ == "__main__": if __name__ == "__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
test_markitdown_remote() test_markitdown_remote()
@ -307,3 +333,4 @@ if __name__ == "__main__":
test_markitdown_exiftool() test_markitdown_exiftool()
test_markitdown_deprecation() test_markitdown_deprecation()
test_markitdown_llm() test_markitdown_llm()
test_markitdown_ollama()

37
web-ui/package.json Normal file
View file

@ -0,0 +1,37 @@
{
"name": "markitdown-web-ui",
"version": "1.0.0",
"description": "Web-based UI for MarkItDown",
"main": "src/App.js",
"scripts": {
"start": "react-scripts start",
"build": "react-scripts build",
"test": "react-scripts test",
"eject": "react-scripts eject"
},
"dependencies": {
"react": "^17.0.2",
"react-dom": "^17.0.2",
"react-scripts": "4.0.3",
"axios": "^0.21.1",
"react-markdown": "^7.0.0"
},
"eslintConfig": {
"extends": [
"react-app",
"react-app/jest"
]
},
"browserslist": {
"production": [
">0.2%",
"not dead",
"not op_mini all"
],
"development": [
"last 1 chrome version",
"last 1 firefox version",
"last 1 safari version"
]
}
}

42
web-ui/src/App.js Normal file
View file

@ -0,0 +1,42 @@
import React, { useState } from 'react';
import FileUpload from './components/FileUpload';
import MarkdownPreview from './components/MarkdownPreview';
import DownloadButton from './components/DownloadButton';
import axios from 'axios';
function App() {
const [markdownContent, setMarkdownContent] = useState('');
const [fileName, setFileName] = useState('');
const handleFileUpload = async (file) => {
const formData = new FormData();
formData.append('file', file);
try {
const response = await axios.post('/api/convert', formData, {
headers: {
'Content-Type': 'multipart/form-data',
},
});
setMarkdownContent(response.data.markdown);
setFileName(file.name);
} catch (error) {
console.error('Error uploading file:', error);
}
};
return (
<div className="App">
<header className="App-header">
<h1>MarkItDown Web UI</h1>
</header>
<main>
<FileUpload onFileUpload={handleFileUpload} />
<MarkdownPreview content={markdownContent} />
<DownloadButton content={markdownContent} fileName={fileName} />
</main>
</div>
);
}
export default App;

View file

@ -0,0 +1,21 @@
import React from 'react';
function DownloadButton({ content, fileName }) {
const handleDownload = () => {
const element = document.createElement('a');
const file = new Blob([content], { type: 'text/markdown' });
element.href = URL.createObjectURL(file);
element.download = fileName.replace(/\.[^/.]+$/, "") + ".md";
document.body.appendChild(element);
element.click();
document.body.removeChild(element);
};
return (
<button onClick={handleDownload}>
Download Markdown
</button>
);
}
export default DownloadButton;

View file

@ -0,0 +1,18 @@
import React from 'react';
function FileUpload({ onFileUpload }) {
const handleFileChange = (event) => {
const file = event.target.files[0];
if (file) {
onFileUpload(file);
}
};
return (
<div className="file-upload">
<input type="file" onChange={handleFileChange} />
</div>
);
}
export default FileUpload;

View file

@ -0,0 +1,12 @@
import React from 'react';
import ReactMarkdown from 'react-markdown';
function MarkdownPreview({ content }) {
return (
<div className="markdown-preview">
<ReactMarkdown>{content}</ReactMarkdown>
</div>
);
}
export default MarkdownPreview;