Add Ollama integration for image descriptions
--- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown?shareId=XXXX-XXXX-XXXX-XXXX).
This commit is contained in:
parent
125e206047
commit
e2470fc413
10 changed files with 232 additions and 20 deletions
|
|
@ -1,32 +1,21 @@
|
||||||
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
|
|
||||||
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
|
|
||||||
{
|
{
|
||||||
"name": "Existing Dockerfile",
|
"name": "Existing Dockerfile",
|
||||||
"build": {
|
"build": {
|
||||||
// Sets the run context to one level up instead of the .devcontainer folder.
|
|
||||||
"context": "..",
|
"context": "..",
|
||||||
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
|
|
||||||
"dockerfile": "../Dockerfile",
|
"dockerfile": "../Dockerfile",
|
||||||
"args": {
|
"args": {
|
||||||
"INSTALL_GIT": "true"
|
"INSTALL_GIT": "true"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
|
||||||
// "features": {},
|
|
||||||
"features": {
|
"features": {
|
||||||
"ghcr.io/devcontainers-extra/features/hatch:2": {}
|
"ghcr.io/devcontainers-extra/features/hatch:2": {},
|
||||||
|
"ghcr.io/devcontainers/features/python:1": {
|
||||||
|
"version": "3.10"
|
||||||
|
},
|
||||||
|
"ghcr.io/devcontainers/features/node:1": {
|
||||||
|
"version": "16"
|
||||||
|
},
|
||||||
|
"ghcr.io/devcontainers/features/ollama:1": {}
|
||||||
},
|
},
|
||||||
|
|
||||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
|
||||||
// "forwardPorts": [],
|
|
||||||
|
|
||||||
// Uncomment the next line to run commands after the container is created.
|
|
||||||
// "postCreateCommand": "cat /etc/os-release",
|
|
||||||
|
|
||||||
// Configure tool-specific properties.
|
|
||||||
// "customizations": {},
|
|
||||||
|
|
||||||
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
|
|
||||||
"remoteUser": "root"
|
"remoteUser": "root"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
ffmpeg \
|
ffmpeg \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN pip install markitdown
|
RUN pip install markitdown ollama
|
||||||
|
|
||||||
# Default USERID and GROUPID
|
# Default USERID and GROUPID
|
||||||
ARG USERID=10000
|
ARG USERID=10000
|
||||||
|
|
|
||||||
12
README.md
12
README.md
|
|
@ -66,6 +66,18 @@ result = md.convert("example.jpg")
|
||||||
print(result.text_content)
|
print(result.text_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To use Ollama for image descriptions, pass an `ollama_client` when constructing `MarkItDown`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from markitdown import MarkItDown
|
||||||
|
from ollama import Ollama
|
||||||
|
|
||||||
|
client = Ollama(api_key="your-api-key")
|
||||||
|
md = MarkItDown(ollama_client=client)
|
||||||
|
result = md.convert("example.jpg")
|
||||||
|
print(result.text_content)
|
||||||
|
```
|
||||||
|
|
||||||
### Docker
|
### Docker
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
|
|
||||||
|
|
@ -1076,6 +1076,54 @@ class ImageConverter(MediaConverter):
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaConverter(DocumentConverter):
    """
    Converts images to markdown via description using an Ollama client.

    Only ``.jpg``, ``.jpeg`` and ``.png`` files are handled, and only when an
    ``ollama_client`` keyword argument is supplied. Keyword arguments read by
    this converter:

    - ``file_extension``: extension of the file being converted, with the dot.
    - ``ollama_client``: client object used to describe the image.
    - ``ollama_prompt``: optional prompt; a default caption prompt is used
      when it is missing or blank.
    - ``ollama_model``: optional model name, used when the client is driven
      through ``generate``.
    """

    # Model requested through `generate` when the caller passes no `ollama_model`.
    DEFAULT_MODEL = "llava"

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Describe the image at ``local_path``.

        Returns ``None`` when the file is not a supported image or when no
        ``ollama_client`` was provided; otherwise returns a
        ``DocumentConverterResult`` whose text is a "# Description:" section.
        """
        # Bail if not an image (tolerate an explicit file_extension=None)
        extension = kwargs.get("file_extension") or ""
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None

        # Bail if there is no client: returning an empty result here would
        # claim the image and produce blank output instead of declining it.
        ollama_client = kwargs.get("ollama_client")
        if ollama_client is None:
            return None

        description = self._get_ollama_description(
            local_path,
            extension,
            ollama_client,
            prompt=kwargs.get("ollama_prompt"),
            model=kwargs.get("ollama_model"),
        )

        # Guard against a client that returns no text at all
        md_content = "\n# Description:\n" + (description or "").strip() + "\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )

    def _get_ollama_description(
        self, local_path, extension, client, prompt=None, model=None
    ):
        """
        Ask ``client`` to describe the image stored at ``local_path``.

        Clients exposing ``describe_image(data_uri, prompt)`` are called exactly
        as before and their ``"description"`` entry is returned. Any other
        client is driven through ``generate(model=..., prompt=..., images=[...])``
        and its ``"response"`` entry is returned.
        """
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        with open(local_path, "rb") as image_file:
            image_bytes = image_file.read()

        describe_image = getattr(client, "describe_image", None)
        if callable(describe_image):
            content_type, _ = mimetypes.guess_type("_dummy" + extension)
            if content_type is None:
                content_type = "image/jpeg"
            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
            data_uri = f"data:{content_type};base64,{image_base64}"
            return describe_image(data_uri, prompt)["description"]

        # The official `ollama` Python client has no `describe_image`; it
        # accepts raw image bytes through `generate(images=[...])`.
        response = client.generate(
            model=model or self.DEFAULT_MODEL,
            prompt=prompt,
            images=[image_bytes],
        )
        return response["response"]
|
||||||
|
|
||||||
|
|
||||||
class ZipConverter(DocumentConverter):
|
class ZipConverter(DocumentConverter):
|
||||||
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||||
|
|
||||||
|
|
@ -1223,6 +1271,7 @@ class MarkItDown:
|
||||||
llm_client: Optional[Any] = None,
|
llm_client: Optional[Any] = None,
|
||||||
llm_model: Optional[str] = None,
|
llm_model: Optional[str] = None,
|
||||||
style_map: Optional[str] = None,
|
style_map: Optional[str] = None,
|
||||||
|
ollama_client: Optional[Any] = None,
|
||||||
# Deprecated
|
# Deprecated
|
||||||
mlm_client: Optional[Any] = None,
|
mlm_client: Optional[Any] = None,
|
||||||
mlm_model: Optional[str] = None,
|
mlm_model: Optional[str] = None,
|
||||||
|
|
@ -1264,6 +1313,7 @@ class MarkItDown:
|
||||||
self._llm_client = llm_client
|
self._llm_client = llm_client
|
||||||
self._llm_model = llm_model
|
self._llm_model = llm_model
|
||||||
self._style_map = style_map
|
self._style_map = style_map
|
||||||
|
self._ollama_client = ollama_client
|
||||||
|
|
||||||
self._page_converters: List[DocumentConverter] = []
|
self._page_converters: List[DocumentConverter] = []
|
||||||
|
|
||||||
|
|
@ -1285,6 +1335,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(IpynbConverter())
|
self.register_page_converter(IpynbConverter())
|
||||||
self.register_page_converter(PdfConverter())
|
self.register_page_converter(PdfConverter())
|
||||||
self.register_page_converter(ZipConverter())
|
self.register_page_converter(ZipConverter())
|
||||||
|
self.register_page_converter(OllamaConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||||
|
|
@ -1445,6 +1496,9 @@ class MarkItDown:
|
||||||
if "llm_model" not in _kwargs and self._llm_model is not None:
|
if "llm_model" not in _kwargs and self._llm_model is not None:
|
||||||
_kwargs["llm_model"] = self._llm_model
|
_kwargs["llm_model"] = self._llm_model
|
||||||
|
|
||||||
|
if "ollama_client" not in _kwargs and self._ollama_client is not None:
|
||||||
|
_kwargs["ollama_client"] = self._ollama_client
|
||||||
|
|
||||||
# Add the list of converters for nested processing
|
# Add the list of converters for nested processing
|
||||||
_kwargs["_parent_converters"] = self._page_converters
|
_kwargs["_parent_converters"] = self._page_converters
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,13 @@ except ModuleNotFoundError:
|
||||||
# Skip exiftool tests if not installed
|
# Skip exiftool tests if not installed
|
||||||
skip_exiftool = shutil.which("exiftool") is None
|
skip_exiftool = shutil.which("exiftool") is None
|
||||||
|
|
||||||
|
# Skip Ollama tests unless OLLAMA_API_KEY is set and the `ollama` package imports
try:
    import ollama

    skip_ollama = not os.environ.get("OLLAMA_API_KEY")
except ModuleNotFoundError:
    skip_ollama = True
|
||||||
|
|
||||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||||
|
|
||||||
JPG_TEST_EXIFTOOL = {
|
JPG_TEST_EXIFTOOL = {
|
||||||
|
|
@ -130,6 +137,11 @@ LLM_TEST_STRINGS = [
|
||||||
"5bda1dd6",
|
"5bda1dd6",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Substrings that test_markitdown_ollama requires in the converted text.
# NOTE(review): both entries echo the wording of the converter's default
# prompt rather than anything specific to the test image — confirm that the
# model output is really expected to contain them.
OLLAMA_TEST_STRINGS = [
    "detailed caption",
    "image",
]
|
||||||
|
|
||||||
|
|
||||||
# --- Helper Functions ---
|
# --- Helper Functions ---
|
||||||
def validate_strings(result, expected_strings, exclude_strings=None):
|
def validate_strings(result, expected_strings, exclude_strings=None):
|
||||||
|
|
@ -300,6 +312,20 @@ def test_markitdown_llm() -> None:
|
||||||
assert test_string in result.text_content.lower()
|
assert test_string in result.text_content.lower()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(skip_ollama, reason="do not run ollama tests without a key")
def test_markitdown_ollama() -> None:
    """Convert the Ollama test image and check every expected substring is present."""
    # NOTE(review): confirm the installed `ollama` package exposes an `Ollama`
    # class that accepts `api_key`.
    api_key = os.environ.get("OLLAMA_API_KEY")
    markitdown = MarkItDown(ollama_client=ollama.Ollama(api_key=api_key))

    image_path = os.path.join(TEST_FILES_DIR, "test_ollama.jpg")
    converted = markitdown.convert(image_path)

    for expected in OLLAMA_TEST_STRINGS:
        assert expected in converted.text_content
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
|
|
@ -307,3 +333,4 @@ if __name__ == "__main__":
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
test_markitdown_deprecation()
|
test_markitdown_deprecation()
|
||||||
test_markitdown_llm()
|
test_markitdown_llm()
|
||||||
|
test_markitdown_ollama()
|
||||||
|
|
|
||||||
37
web-ui/package.json
Normal file
37
web-ui/package.json
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
{
|
||||||
|
"name": "markitdown-web-ui",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "Web-based UI for MarkItDown",
|
||||||
|
"main": "src/App.js",
|
||||||
|
"scripts": {
|
||||||
|
"start": "react-scripts start",
|
||||||
|
"build": "react-scripts build",
|
||||||
|
"test": "react-scripts test",
|
||||||
|
"eject": "react-scripts eject"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"react": "^17.0.2",
|
||||||
|
"react-dom": "^17.0.2",
|
||||||
|
"react-scripts": "4.0.3",
|
||||||
|
"axios": "^0.21.1",
|
||||||
|
"react-markdown": "^7.0.0"
|
||||||
|
},
|
||||||
|
"eslintConfig": {
|
||||||
|
"extends": [
|
||||||
|
"react-app",
|
||||||
|
"react-app/jest"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"browserslist": {
|
||||||
|
"production": [
|
||||||
|
">0.2%",
|
||||||
|
"not dead",
|
||||||
|
"not op_mini all"
|
||||||
|
],
|
||||||
|
"development": [
|
||||||
|
"last 1 chrome version",
|
||||||
|
"last 1 firefox version",
|
||||||
|
"last 1 safari version"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
42
web-ui/src/App.js
Normal file
42
web-ui/src/App.js
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
import React, { useState } from 'react';
|
||||||
|
import FileUpload from './components/FileUpload';
|
||||||
|
import MarkdownPreview from './components/MarkdownPreview';
|
||||||
|
import DownloadButton from './components/DownloadButton';
|
||||||
|
import axios from 'axios';
|
||||||
|
|
||||||
|
function App() {
|
||||||
|
const [markdownContent, setMarkdownContent] = useState('');
|
||||||
|
const [fileName, setFileName] = useState('');
|
||||||
|
|
||||||
|
const handleFileUpload = async (file) => {
|
||||||
|
const formData = new FormData();
|
||||||
|
formData.append('file', file);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await axios.post('/api/convert', formData, {
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'multipart/form-data',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
setMarkdownContent(response.data.markdown);
|
||||||
|
setFileName(file.name);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error uploading file:', error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="App">
|
||||||
|
<header className="App-header">
|
||||||
|
<h1>MarkItDown Web UI</h1>
|
||||||
|
</header>
|
||||||
|
<main>
|
||||||
|
<FileUpload onFileUpload={handleFileUpload} />
|
||||||
|
<MarkdownPreview content={markdownContent} />
|
||||||
|
<DownloadButton content={markdownContent} fileName={fileName} />
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
export default App;
|
||||||
21
web-ui/src/components/DownloadButton.js
Normal file
21
web-ui/src/components/DownloadButton.js
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
import React from 'react';
|
||||||
|
|
||||||
|
function DownloadButton({ content, fileName }) {
|
||||||
|
const handleDownload = () => {
|
||||||
|
const element = document.createElement('a');
|
||||||
|
const file = new Blob([content], { type: 'text/markdown' });
|
||||||
|
element.href = URL.createObjectURL(file);
|
||||||
|
element.download = fileName.replace(/\.[^/.]+$/, "") + ".md";
|
||||||
|
document.body.appendChild(element);
|
||||||
|
element.click();
|
||||||
|
document.body.removeChild(element);
|
||||||
|
};
|
||||||
|
|
||||||
|
return (
|
||||||
|
<button onClick={handleDownload}>
|
||||||
|
Download Markdown
|
||||||
|
</button>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
export default DownloadButton;
|
||||||
18
web-ui/src/components/FileUpload.js
Normal file
18
web-ui/src/components/FileUpload.js
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
import React from 'react';
|
||||||
|
|
||||||
|
function FileUpload({ onFileUpload }) {
|
||||||
|
const handleFileChange = (event) => {
|
||||||
|
const file = event.target.files[0];
|
||||||
|
if (file) {
|
||||||
|
onFileUpload(file);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="file-upload">
|
||||||
|
<input type="file" onChange={handleFileChange} />
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
export default FileUpload;
|
||||||
12
web-ui/src/components/MarkdownPreview.js
Normal file
12
web-ui/src/components/MarkdownPreview.js
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
import React from 'react';
|
||||||
|
import ReactMarkdown from 'react-markdown';
|
||||||
|
|
||||||
|
function MarkdownPreview({ content }) {
|
||||||
|
return (
|
||||||
|
<div className="markdown-preview">
|
||||||
|
<ReactMarkdown>{content}</ReactMarkdown>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
export default MarkdownPreview;
|
||||||
Loading…
Reference in a new issue