Add Ollama integration for image descriptions

--- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown?shareId=XXXX-XXXX-XXXX-XXXX).
2025-01-03 13:48:19 -07:00 · 2025-01-03 13:48:19 -07:00 · e2470fc413
commit e2470fc413
parent 125e206047
10 changed files with 232 additions and 20 deletions
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@ -1,32 +1,21 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
 {
 	"name": "Existing Dockerfile",
 	"build": {
 		// Sets the run context to one level up instead of the .devcontainer folder.
 		"context": "..",
 		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
 		"dockerfile": "../Dockerfile",
 		"args": {
 			"INSTALL_GIT": "true"
 		}
 	},
 	// Features to add to the dev container. More info: https://containers.dev/features.
 	// "features": {},
 	"features": {
-		"ghcr.io/devcontainers-extra/features/hatch:2": {}
+		"ghcr.io/devcontainers-extra/features/hatch:2": {},
 		"ghcr.io/devcontainers/features/python:1": {
 			"version": "3.10"
 		},
 		"ghcr.io/devcontainers/features/node:1": {
 			"version": "16"
 		},
 		"ghcr.io/devcontainers/features/ollama:1": {}
 	},
 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
 	// "forwardPorts": [],
 	// Uncomment the next line to run commands after the container is created.
 	// "postCreateCommand": "cat /etc/os-release",
 	// Configure tool-specific properties.
 	// "customizations": {},
 	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
 	"remoteUser": "root"
 }
--- a/2
+++ b/2
@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*
-RUN pip install markitdown
+RUN pip install markitdown ollama
 # Default USERID and GROUPID
 ARG USERID=10000
--- a/README.md
+++ b/README.md
@ -66,6 +66,18 @@ result = md.convert("example.jpg")
 print(result.text_content)
 ```
 To use Ollama for image descriptions, provide `ollama_client`:
 ```python
 from markitdown import MarkItDown
 from ollama import Ollama
 client = Ollama(api_key="your-api-key")
 md = MarkItDown(ollama_client=client)
 result = md.convert("example.jpg")
 print(result.text_content)
 ```
 ### Docker
 ```sh
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -1076,6 +1076,54 @@ class ImageConverter(MediaConverter):
        return response.choices[0].message.content
 class OllamaConverter(DocumentConverter):
    """
    Converts images to markdown via description using Ollama API.

    Only ``.jpg``, ``.jpeg`` and ``.png`` files are handled, and only when an
    ``ollama_client`` keyword argument is supplied. In every other case the
    converter declines by returning ``None``.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Describe the image at ``local_path`` using the supplied Ollama client.

        Keyword arguments read:
            file_extension: extension of the file, including the leading dot.
            ollama_client: client object used to request the description.
            ollama_prompt: optional prompt overriding the default caption prompt.

        Returns:
            A ``DocumentConverterResult`` whose text is a "# Description:"
            section, or ``None`` when the file is not a supported image or no
            client was provided.
        """
        # Bail if not an image (a missing or None extension is treated as "")
        extension = kwargs.get("file_extension") or ""
        if extension.lower() not in (".jpg", ".jpeg", ".png"):
            return None

        # Bail if there is no client: returning an empty result here would be
        # reported as a successful conversion with no content.
        ollama_client = kwargs.get("ollama_client")
        if ollama_client is None:
            return None

        # Try describing the image with Ollama
        description = self._get_ollama_description(
            local_path,
            extension,
            ollama_client,
            prompt=kwargs.get("ollama_prompt"),
        )
        md_content = "\n# Description:\n" + description.strip() + "\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )

    def _get_ollama_description(self, local_path, extension, client, prompt=None):
        """
        Send the image to ``client`` as a base64 data URI and return its description.

        Args:
            local_path: path of the image file to read.
            extension: file extension (with dot) used to guess the MIME type.
            client: object providing ``describe_image(data_uri, prompt)``.
            prompt: optional prompt; a default caption prompt is used when it
                is ``None`` or blank.

        Returns:
            The value stored under ``"description"`` in the client's response.
        """
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        # Guess the MIME type from the extension alone; fall back to JPEG.
        content_type, _ = mimetypes.guess_type("_dummy" + extension)
        if content_type is None:
            content_type = "image/jpeg"

        with open(local_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
        data_uri = f"data:{content_type};base64,{image_base64}"

        # NOTE(review): the official `ollama` Python client appears to expose
        # `chat`/`generate` rather than `describe_image` -- confirm which
        # client type is expected here.
        response = client.describe_image(data_uri, prompt)
        return response["description"]
 class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.
@ -1223,6 +1271,7 @@ class MarkItDown:
        llm_client: Optional[Any] = None,
        llm_model: Optional[str] = None,
        style_map: Optional[str] = None,
        ollama_client: Optional[Any] = None,
        # Deprecated
        mlm_client: Optional[Any] = None,
        mlm_model: Optional[str] = None,
@ -1264,6 +1313,7 @@ class MarkItDown:
        self._llm_client = llm_client
        self._llm_model = llm_model
        self._style_map = style_map
        self._ollama_client = ollama_client
        self._page_converters: List[DocumentConverter] = []
@ -1285,6 +1335,7 @@ class MarkItDown:
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
        self.register_page_converter(OllamaConverter())
    def convert(
        self, source: Union[str, requests.Response, Path], **kwargs: Any
@ -1445,6 +1496,9 @@ class MarkItDown:
                if "llm_model" not in _kwargs and self._llm_model is not None:
                    _kwargs["llm_model"] = self._llm_model
                if "ollama_client" not in _kwargs and self._ollama_client is not None:
                    _kwargs["ollama_client"] = self._ollama_client
                # Add the list of converters for nested processing
                _kwargs["_parent_converters"] = self._page_converters
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -25,6 +25,13 @@ except ModuleNotFoundError:
 # Skip exiftool tests if not installed
 skip_exiftool = shutil.which("exiftool") is None
 # Skip Ollama tests unless OLLAMA_API_KEY is set and the ollama package can be imported
 skip_ollama = False if os.environ.get("OLLAMA_API_KEY") else True
 try:
    import ollama
 except ModuleNotFoundError:
    skip_ollama = True
 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 JPG_TEST_EXIFTOOL = {
@ -130,6 +137,11 @@ LLM_TEST_STRINGS = [
    "5bda1dd6",
 ]
 OLLAMA_TEST_STRINGS = [
    "detailed caption",
    "image",
 ]
 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
@ -300,6 +312,20 @@ def test_markitdown_llm() -> None:
        assert test_string in result.text_content.lower()
@pytest.mark.skipif(
    skip_ollama,
    reason="do not run ollama tests without a key",
)
def test_markitdown_ollama() -> None:
    """Convert a test image with an Ollama client and check the description text."""
    # NOTE(review): confirm that the installed `ollama` package provides an
    # `Ollama` class accepting `api_key`.
    client = ollama.Ollama(api_key=os.environ.get("OLLAMA_API_KEY"))
    markitdown = MarkItDown(ollama_client=client)

    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_ollama.jpg"))

    # Compare case-insensitively, as the LLM test does: the expected strings
    # are lower-case while generated text may be capitalized.
    description = result.text_content.lower()
    for test_string in OLLAMA_TEST_STRINGS:
        assert test_string in description
 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_markitdown_remote()
@ -307,3 +333,4 @@ if __name__ == "__main__":
    test_markitdown_exiftool()
    test_markitdown_deprecation()
    test_markitdown_llm()
    test_markitdown_ollama()
--- a/web-ui/package.json
+++ b/web-ui/package.json
@ -0,0 +1,37 @@
 {
  "name": "markitdown-web-ui",
  "version": "1.0.0",
  "description": "Web-based UI for MarkItDown",
  "main": "src/App.js",
  "scripts": {
    "start": "react-scripts start",
    "build": "react-scripts build",
    "test": "react-scripts test",
    "eject": "react-scripts eject"
  },
  "dependencies": {
    "react": "^17.0.2",
    "react-dom": "^17.0.2",
    "react-scripts": "4.0.3",
    "axios": "^0.21.1",
    "react-markdown": "^7.0.0"
  },
  "eslintConfig": {
    "extends": [
      "react-app",
      "react-app/jest"
    ]
  },
  "browserslist": {
    "production": [
      ">0.2%",
      "not dead",
      "not op_mini all"
    ],
    "development": [
      "last 1 chrome version",
      "last 1 firefox version",
      "last 1 safari version"
    ]
  }
 }
--- a/web-ui/src/App.js
+++ b/web-ui/src/App.js
@ -0,0 +1,42 @@
 import React, { useState } from 'react';
 import FileUpload from './components/FileUpload';
 import MarkdownPreview from './components/MarkdownPreview';
 import DownloadButton from './components/DownloadButton';
 import axios from 'axios';
 function App() {
  const [markdownContent, setMarkdownContent] = useState('');
  const [fileName, setFileName] = useState('');
  const handleFileUpload = async (file) => {
    const formData = new FormData();
    formData.append('file', file);
    try {
      const response = await axios.post('/api/convert', formData, {
        headers: {
          'Content-Type': 'multipart/form-data',
        },
      });
      setMarkdownContent(response.data.markdown);
      setFileName(file.name);
    } catch (error) {
      console.error('Error uploading file:', error);
    }
  };
  return (
    <div className="App">
      <header className="App-header">
        <h1>MarkItDown Web UI</h1>
      </header>
      <main>
        <FileUpload onFileUpload={handleFileUpload} />
        <MarkdownPreview content={markdownContent} />
        <DownloadButton content={markdownContent} fileName={fileName} />
      </main>
    </div>
  );
 }
 export default App;
--- a/web-ui/src/components/DownloadButton.js
+++ b/web-ui/src/components/DownloadButton.js
@ -0,0 +1,21 @@
 import React from 'react';
 function DownloadButton({ content, fileName }) {
  const handleDownload = () => {
    const element = document.createElement('a');
    const file = new Blob([content], { type: 'text/markdown' });
    element.href = URL.createObjectURL(file);
    element.download = fileName.replace(/\.[^/.]+$/, "") + ".md";
    document.body.appendChild(element);
    element.click();
    document.body.removeChild(element);
  };
  return (
    <button onClick={handleDownload}>
      Download Markdown
    </button>
  );
 }
 export default DownloadButton;
--- a/web-ui/src/components/FileUpload.js
+++ b/web-ui/src/components/FileUpload.js
@ -0,0 +1,18 @@
 import React from 'react';
 function FileUpload({ onFileUpload }) {
  const handleFileChange = (event) => {
    const file = event.target.files[0];
    if (file) {
      onFileUpload(file);
    }
  };
  return (
    <div className="file-upload">
      <input type="file" onChange={handleFileChange} />
    </div>
  );
 }
 export default FileUpload;
--- a/web-ui/src/components/MarkdownPreview.js
+++ b/web-ui/src/components/MarkdownPreview.js
@ -0,0 +1,12 @@
 import React from 'react';
 import ReactMarkdown from 'react-markdown';
 function MarkdownPreview({ content }) {
  return (
    <div className="markdown-preview">
      <ReactMarkdown>{content}</ReactMarkdown>
    </div>
  );
 }
 export default MarkdownPreview;