Merge branch 'main' into wangsrGit119-patch-1

This commit is contained in:
suke 2024-12-23 10:54:52 +08:00 committed by GitHub
commit 5a3ca479f1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 936 additions and 119 deletions

View file

@ -0,0 +1,32 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{
"name": "Existing Dockerfile",
"build": {
// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerfile": "../Dockerfile",
"args": {
"INSTALL_GIT": "true"
}
},
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
"features": {
"ghcr.io/devcontainers-extra/features/hatch:2": {}
},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Uncomment the next line to run commands after the container is created.
// "postCreateCommand": "cat /etc/os-release",
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
"remoteUser": "root"
}

1
.dockerignore Normal file
View file

@ -0,0 +1 @@
*

1
.gitattributes vendored Normal file
View file

@ -0,0 +1 @@
tests/test_files/** linguist-vendored

6
.github/dependabot.yml vendored Normal file
View file

@ -0,0 +1,6 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"

View file

@ -5,9 +5,9 @@ jobs:
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v2 uses: actions/setup-python@v5
with: with:
python-version: "3.x" python-version: "3.x"

View file

@ -5,8 +5,8 @@ jobs:
tests: tests:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v4
- uses: actions/setup-python@v4 - uses: actions/setup-python@v5
with: with:
python-version: | python-version: |
3.10 3.10
@ -14,7 +14,7 @@ jobs:
3.12 3.12
- name: Set up pip cache - name: Set up pip cache
if: runner.os == 'Linux' if: runner.os == 'Linux'
uses: actions/cache@v3 uses: actions/cache@v4
with: with:
path: ~/.cache/pip path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}

4
.gitignore vendored
View file

@ -1,3 +1,5 @@
.vscode
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
@ -160,3 +162,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear # and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
src/.DS_Store
.DS_Store

23
Dockerfile Normal file
View file

@ -0,0 +1,23 @@
FROM python:3.13-slim-bullseye

# The package installation steps below need root; privileges are dropped
# again right before ENTRYPOINT.
USER root

# Optional git install, enabled with --build-arg INSTALL_GIT=true.
ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
        apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
    fi

# Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \
        ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir markitdown

# Default USERID and GROUPID
ARG USERID=10000
ARG GROUPID=10000

# Run the tool as an unprivileged numeric user.
USER $USERID:$GROUPID

ENTRYPOINT [ "markitdown" ]

130
README.md
View file

@ -1,56 +1,114 @@
> [!IMPORTANT]
> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year!
# MarkItDown # MarkItDown
The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.) [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
It presently supports:
- PDF (.pdf) MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
- PowerPoint (.pptx) It supports:
- Word (.docx) - PDF
- Excel (.xlsx) - PowerPoint
- Images (EXIF metadata, and OCR) - Word
- Audio (EXIF metadata, and speech transcription) - Excel
- HTML (special handling of Wikipedia, etc.) - Images (EXIF metadata and OCR)
- Various other text-based formats (csv, json, xml, etc.) - Audio (EXIF metadata and speech transcription)
- HTML
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)
# Installation To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
You can install `markitdown` using pip: ## Usage
```python ### Command-Line
pip install markitdown
```bash
markitdown path-to-file.pdf > document.md
``` ```
or from the source Or use `-o` to specify the output file:
```sh ```bash
pip install -e . markitdown path-to-file.pdf -o document.md
``` ```
You can also pipe content:
# Usage ```bash
The API is simple: cat path-to-file.pdf | markitdown
```
### Python API
Basic usage in Python:
```python ```python
from markitdown import MarkItDown from markitdown import MarkItDown
markitdown = MarkItDown() md = MarkItDown()
result = markitdown.convert("test.xlsx") result = md.convert("test.xlsx")
print(result.text_content) print(result.text_content)
``` ```
You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client. To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python ```python
from markitdown import MarkItDown from markitdown import MarkItDown
from openai import OpenAI from openai import OpenAI
client = OpenAI() client = OpenAI()
md = MarkItDown(mlm_client=client, mlm_model="gpt-4o") md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg") result = md.convert("example.jpg")
print(result.text_content) print(result.text_content)
``` ```
### Docker
```sh
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```
<details>
<summary>Batch Processing Multiple Files</summary>
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py
from markitdown import MarkItDown
from openai import OpenAI
import os
client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
for file in files_to_convert:
print(f"\nConverting {file}...")
try:
md_file = os.path.splitext(file)[0] + '.md'
result = md.convert(file)
with open(md_file, 'w') as f:
f.write(result.text_content)
print(f"Successfully converted {file} to {md_file}")
except Exception as e:
print(f"Error converting {file}: {str(e)}")
print("\nAll conversions completed!")
```
2. Place the script in the same directory as your files
3. Install required packages: like openai
4. Run script ```bash python convert.py ```
Note that original files will remain unchanged and new markdown files are created with the same base name.
</details>
## Contributing ## Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a This project welcomes contributions and suggestions. Most contributions require you to agree to a
@ -65,21 +123,37 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
### Running Tests ### How to Contribute
To run the tests for this project, use the following command: You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are of course just suggestions and you are welcome to contribute in any way you like.
<div align="center">
| | All | Especially Needs Help from Community |
|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
| **PRs** | [All PRs](https://github.com/microsoft/markitdown/pulls) | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22) |
</div>
### Running Tests and Checks
- Install `hatch` in your environment and run tests:
```sh ```sh
pip install hatch # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
hatch shell hatch shell
hatch test hatch test
``` ```
### Running Pre-commit Checks (Alternative) Use the Devcontainer which has all the dependencies installed:
```sh ```sh
pre-commit run --all-files # Reopen the project in Devcontainer and run:
hatch test
``` ```
- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
## Trademarks ## Trademarks
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft

View file

@ -5,7 +5,7 @@ build-backend = "hatchling.build"
[project] [project]
name = "markitdown" name = "markitdown"
dynamic = ["version"] dynamic = ["version"]
description = '' description = 'Utility tool for converting various files to Markdown'
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
license = "MIT" license = "MIT"
@ -38,6 +38,8 @@ dependencies = [
"youtube-transcript-api", "youtube-transcript-api",
"SpeechRecognition", "SpeechRecognition",
"pathvalidate", "pathvalidate",
"charset-normalizer",
"openai",
] ]
[project.urls] [project.urls]
@ -76,3 +78,6 @@ exclude_lines = [
"if __name__ == .__main__.:", "if __name__ == .__main__.:",
"if TYPE_CHECKING:", "if TYPE_CHECKING:",
] ]
[tool.hatch.build.targets.sdist]
only-include = ["src/markitdown"]

View file

@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.0.1a1" __version__ = "0.0.1a3"

View file

@ -1,21 +1,19 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
import argparse
import sys import sys
from ._markitdown import MarkItDown from textwrap import dedent
from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult
def main(): def main():
if len(sys.argv) == 1: parser = argparse.ArgumentParser(
markitdown = MarkItDown() description="Convert various file formats to markdown.",
result = markitdown.convert_stream(sys.stdin.buffer) prog="markitdown",
print(result.text_content) formatter_class=argparse.RawDescriptionHelpFormatter,
elif len(sys.argv) == 2: usage=dedent(
markitdown = MarkItDown()
result = markitdown.convert(sys.argv[1])
print(result.text_content)
else:
sys.stderr.write(
""" """
SYNTAX: SYNTAX:
@ -33,10 +31,52 @@ EXAMPLE:
OR OR
markitdown < example.pdf markitdown < example.pdf
""".strip()
+ "\n" OR to save to a file use
markitdown example.pdf -o example.md
OR
markitdown example.pdf > example.md
"""
).strip(),
) )
parser.add_argument(
"-v",
"--version",
action="version",
version=f"%(prog)s {__version__}",
help="show the version number and exit",
)
parser.add_argument("filename", nargs="?")
parser.add_argument(
"-o",
"--output",
help="Output file name. If not provided, output is written to stdout.",
)
args = parser.parse_args()
if args.filename is None:
markitdown = MarkItDown()
result = markitdown.convert_stream(sys.stdin.buffer)
_handle_output(args, result)
else:
markitdown = MarkItDown()
result = markitdown.convert(args.filename)
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):
    """Emit the converted text: to the file named by ``args.output`` when it
    is set, otherwise to stdout."""
    if not args.output:
        print(result.text_content)
        return

    with open(args.output, "w", encoding="utf-8") as out_file:
        out_file.write(result.text_content)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View file

@ -12,8 +12,12 @@ import subprocess
import sys import sys
import tempfile import tempfile
import traceback import traceback
import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
import mammoth import mammoth
import markdownify import markdownify
@ -26,15 +30,24 @@ import pptx
import puremagic import puremagic
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from charset_normalizer import from_path
# Optional Transcription support # Optional Transcription support
try: try:
# Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
finally:
resetwarnings()
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
@ -161,9 +174,7 @@ class PlainTextConverter(DocumentConverter):
elif "text/" not in content_type.lower(): elif "text/" not in content_type.lower():
return None return None
text_content = "" text_content = str(from_path(local_path).best())
with open(local_path, "rt", encoding="utf-8") as fh:
text_content = fh.read()
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=text_content, text_content=text_content,
@ -213,6 +224,143 @@ class HtmlConverter(DocumentConverter):
) )
class RSSConverter(DocumentConverter):
    """Convert RSS / Atom feeds to markdown.

    The feed (or channel) title becomes a level-1 heading, followed by the
    subtitle/description, and then one level-2 section per entry/item.
    """

    def convert(
        self, local_path: str, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        """Convert the feed at ``local_path``.

        Returns None when the extension is not .xml/.rss/.atom, when the file
        is not parseable XML, or when the document is neither an RSS feed nor
        an Atom feed with at least one entry.
        """
        # Bail if not RSS type
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".xml", ".rss", ".atom"]:
            return None

        try:
            doc = minidom.parse(local_path)
        except Exception:
            # Not well-formed XML (or unreadable): let another converter try.
            # `Exception` rather than `BaseException`, so KeyboardInterrupt
            # and SystemExit still propagate.
            return None

        if doc.getElementsByTagName("rss"):
            # A RSS feed must have a root element of <rss>
            return self._parse_rss_type(doc)

        feeds = doc.getElementsByTagName("feed")
        if feeds and feeds[0].getElementsByTagName("entry"):
            # An Atom feed must have a root element of <feed> and at least one <entry>
            return self._parse_atom_type(doc)

        # not rss or atom
        return None

    def _parse_atom_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
        """Parse the type of an Atom feed.

        Returns None if the feed type is not recognized or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("feed")[0]
            title = self._get_data_by_tag_name(root, "title")
            subtitle = self._get_data_by_tag_name(root, "subtitle")
            entries = root.getElementsByTagName("entry")

            md_text = ""
            # Only emit the heading when a title exists (avoids "# None").
            if title:
                md_text += f"# {title}\n"
            if subtitle:
                md_text += f"{subtitle}\n"
            for entry in entries:
                entry_title = self._get_data_by_tag_name(entry, "title")
                entry_summary = self._get_data_by_tag_name(entry, "summary")
                entry_updated = self._get_data_by_tag_name(entry, "updated")
                entry_content = self._get_data_by_tag_name(entry, "content")

                if entry_title:
                    md_text += f"\n## {entry_title}\n"
                if entry_updated:
                    md_text += f"Updated on: {entry_updated}\n"
                if entry_summary:
                    md_text += self._parse_content(entry_summary)
                if entry_content:
                    md_text += self._parse_content(entry_content)

            return DocumentConverterResult(
                title=title,
                text_content=md_text,
            )
        except Exception:
            return None

    def _parse_rss_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
        """Parse the type of an RSS feed.

        Returns None if the feed type is not recognized or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("rss")[0]
            channel = root.getElementsByTagName("channel")
            if not channel:
                return None
            channel = channel[0]
            channel_title = self._get_data_by_tag_name(channel, "title")
            channel_description = self._get_data_by_tag_name(channel, "description")
            items = channel.getElementsByTagName("item")

            # Always start from an empty string so that a channel without a
            # title does not leave md_text unbound for the appends below.
            md_text = ""
            if channel_title:
                md_text += f"# {channel_title}\n"
            if channel_description:
                md_text += f"{channel_description}\n"
            for item in items or []:
                title = self._get_data_by_tag_name(item, "title")
                description = self._get_data_by_tag_name(item, "description")
                pubDate = self._get_data_by_tag_name(item, "pubDate")
                content = self._get_data_by_tag_name(item, "content:encoded")

                if title:
                    md_text += f"\n## {title}\n"
                if pubDate:
                    md_text += f"Published on: {pubDate}\n"
                if description:
                    md_text += self._parse_content(description)
                if content:
                    md_text += self._parse_content(content)

            return DocumentConverterResult(
                title=channel_title,
                text_content=md_text,
            )
        except Exception:
            # Diagnostics go to stderr: stdout carries the converted markdown
            # when the package is used from the command line.
            sys.stderr.write(traceback.format_exc())
            return None

    def _parse_content(self, content: str) -> str:
        """Parse the content of an RSS feed item.

        Falls back to returning ``content`` unchanged if HTML conversion fails.
        """
        try:
            # using bs4 because many RSS feeds have HTML-styled content
            soup = BeautifulSoup(content, "html.parser")
            return _CustomMarkdownify().convert_soup(soup)
        except Exception:
            return content

    def _get_data_by_tag_name(
        self, element: minidom.Element, tag_name: str
    ) -> Union[str, None]:
        """Get data from first child element with the given tag name.

        Returns None when no such element is found, when it is empty, or when
        its first child carries no character data (e.g. it is itself an
        element, as with inline markup inside the tag).
        """
        nodes = element.getElementsByTagName(tag_name)
        if not nodes:
            return None
        first_child = nodes[0].firstChild
        if first_child is None:
            return None
        # Element nodes have no `.data`; treat them as "no text" instead of
        # raising AttributeError and discarding the whole feed.
        return getattr(first_child, "data", None)
class WikipediaConverter(DocumentConverter): class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content.""" """Handle Wikipedia pages separately, focusing only on the main document content."""
@ -344,8 +492,11 @@ class YouTubeConverter(DocumentConverter):
assert isinstance(params["v"][0], str) assert isinstance(params["v"][0], str)
video_id = str(params["v"][0]) video_id = str(params["v"][0])
try: try:
youtube_transcript_languages = kwargs.get(
"youtube_transcript_languages", ("en",)
)
# Must be a single transcript. # Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
# Alternative formatting: # Alternative formatting:
# formatter = TextFormatter() # formatter = TextFormatter()
@ -391,6 +542,67 @@ class YouTubeConverter(DocumentConverter):
return None return None
class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the notebook at ``local_path``.

        Returns None when ``file_extension`` is not ``.ipynb``.
        Raises FileConversionException if the notebook content cannot be
        converted.
        """
        # Bail if not ipynb
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".ipynb":
            return None

        # Parse and convert the notebook
        with open(local_path, "rt", encoding="utf-8") as fh:
            notebook_content = json.load(fh)
        return self._convert(notebook_content)

    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
        """Helper function that converts notebook JSON content to Markdown.

        Markdown cells are copied through, code cells are wrapped in
        ```python fences and raw cells in plain fences. The title is the
        notebook metadata "title" if present, otherwise the first "# " heading
        found in a markdown cell.
        """
        try:
            md_output = []
            title = None

            for cell in notebook_content.get("cells", []):
                cell_type = cell.get("cell_type", "")
                source_lines = cell.get("source", [])
                # The notebook format allows `source` to be one multi-line
                # string instead of a list of lines; normalise so the title
                # scan below sees lines rather than single characters.
                if isinstance(source_lines, str):
                    source_lines = source_lines.splitlines(keepends=True)
                cell_text = "".join(source_lines)

                if cell_type == "markdown":
                    md_output.append(cell_text)

                    # Extract the first # heading as title if not already found
                    if title is None:
                        for line in source_lines:
                            if line.startswith("# "):
                                # Drop only the "# " prefix; str.lstrip("# ")
                                # would also eat leading '#'/' ' characters
                                # that belong to the title text itself.
                                title = line[2:].strip()
                                break

                elif cell_type == "code":
                    # Code cells are wrapped in Markdown code blocks
                    md_output.append(f"```python\n{cell_text}\n```")
                elif cell_type == "raw":
                    md_output.append(f"```\n{cell_text}\n```")

            md_text = "\n\n".join(md_output)

            # Check for title in notebook metadata
            title = notebook_content.get("metadata", {}).get("title", title)

            return DocumentConverterResult(
                title=title,
                text_content=md_text,
            )

        except Exception as e:
            raise FileConversionException(
                f"Error converting .ipynb file: {str(e)}"
            ) from e
class BingSerpConverter(DocumentConverter): class BingSerpConverter(DocumentConverter):
""" """
Handle Bing results pages (only the organic search results). Handle Bing results pages (only the organic search results).
@ -492,7 +704,9 @@ class DocxConverter(HtmlConverter):
result = None result = None
with open(local_path, "rb") as docx_file: with open(local_path, "rb") as docx_file:
result = mammoth.convert_to_html(docx_file) style_map = kwargs.get("style_map", None)
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value html_content = result.value
result = self._convert(html_content) result = self._convert(html_content)
@ -582,6 +796,10 @@ class PptxConverter(HtmlConverter):
"\n" + self._convert(html_table).text_content.strip() + "\n" "\n" + self._convert(html_table).text_content.strip() + "\n"
) )
# Charts
if shape.has_chart:
md_content += self._convert_chart_to_markdown(shape.chart)
# Text areas # Text areas
elif shape.has_text_frame: elif shape.has_text_frame:
if shape == title: if shape == title:
@ -616,6 +834,29 @@ class PptxConverter(HtmlConverter):
return True return True
return False return False
def _convert_chart_to_markdown(self, chart):
md = "\n\n### Chart"
if chart.has_title:
md += f": {chart.chart_title.text_frame.text}"
md += "\n\n"
data = []
category_names = [c.label for c in chart.plots[0].categories]
series_names = [s.name for s in chart.series]
data.append(["Category"] + series_names)
for idx, category in enumerate(category_names):
row = [category]
for series in chart.series:
row.append(series.values[idx])
data.append(row)
markdown_table = []
for row in data:
markdown_table.append("| " + " | ".join(map(str, row)) + " |")
header = markdown_table[0]
separator = "|" + "|".join(["---"] * len(data[0])) + "|"
return md + "\n".join([header, separator] + markdown_table[1:])
class MediaConverter(DocumentConverter): class MediaConverter(DocumentConverter):
""" """
@ -642,7 +883,7 @@ class WavConverter(MediaConverter):
""" """
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX # Bail if not a WAV
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".wav": if extension.lower() != ".wav":
return None return None
@ -754,11 +995,11 @@ class Mp3Converter(WavConverter):
class ImageConverter(MediaConverter): class ImageConverter(MediaConverter):
""" """
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured). Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
""" """
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX # Bail if not an image
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".jpg", ".jpeg", ".png"]: if extension.lower() not in [".jpg", ".jpeg", ".png"]:
return None return None
@ -784,17 +1025,17 @@ class ImageConverter(MediaConverter):
md_content += f"{f}: {metadata[f]}\n" md_content += f"{f}: {metadata[f]}\n"
# Try describing the image with GPTV # Try describing the image with GPTV
mlm_client = kwargs.get("mlm_client") llm_client = kwargs.get("llm_client")
mlm_model = kwargs.get("mlm_model") llm_model = kwargs.get("llm_model")
if mlm_client is not None and mlm_model is not None: if llm_client is not None and llm_model is not None:
md_content += ( md_content += (
"\n# Description:\n" "\n# Description:\n"
+ self._get_mlm_description( + self._get_llm_description(
local_path, local_path,
extension, extension,
mlm_client, llm_client,
mlm_model, llm_model,
prompt=kwargs.get("mlm_prompt"), prompt=kwargs.get("llm_prompt"),
).strip() ).strip()
+ "\n" + "\n"
) )
@ -818,12 +1059,10 @@ class ImageConverter(MediaConverter):
text_content=md_content, text_content=md_content,
) )
def _get_mlm_description(self, local_path, extension, client, model, prompt=None): def _get_llm_description(self, local_path, extension, client, model, prompt=None):
if prompt is None or prompt.strip() == "": if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image." prompt = "Write a detailed caption for this image."
sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
data_uri = "" data_uri = ""
with open(local_path, "rb") as image_file: with open(local_path, "rb") as image_file:
content_type, encoding = mimetypes.guess_type("_dummy" + extension) content_type, encoding = mimetypes.guess_type("_dummy" + extension)
@ -851,6 +1090,135 @@ class ImageConverter(MediaConverter):
return response.choices[0].message.content return response.choices[0].message.content
class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.

    The converter extracts the ZIP contents to a temporary directory, processes each file
    using appropriate converters based on file extensions, and then combines the results
    into a single markdown document. The temporary directory is cleaned up after processing.

    Example output format:
    ```markdown
    Content from the zip file `example.zip`:

    ## File: docs/readme.txt

    This is the content of readme.txt
    Multiple lines are preserved

    ## File: images/example.jpg

    ImageSize: 1920x1080
    DateTimeOriginal: 2024-02-15 14:30:00
    Description: A beautiful landscape photo

    ## File: data/report.xlsx

    ## Sheet1
    | Column1 | Column2 | Column3 |
    |---------|---------|---------|
    | data1   | data2   | data3   |
    | data4   | data5   | data6   |
    ```

    Key features:
    - Maintains original file structure in headings
    - Processes files in nested directories (nested ZIP archives are not expanded)
    - Uses appropriate converters for each file type
    - Preserves formatting of converted content
    - Cleans up temporary files after processing
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the ZIP archive at ``local_path``.

        Recognised kwargs:
        - file_extension: must be ".zip" (case-insensitive), else None is returned.
        - _parent_converters: converters tried, in order, on each extracted file.
        - cleanup_extracted: when true (the default) the extraction directory
          is removed afterwards, including when conversion fails.

        Failures are reported as a result whose text starts with "[ERROR]"
        rather than raised.
        """
        # Bail if not a ZIP
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".zip":
            return None

        # Get parent converters list if available
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )

        extracted_zip_folder_name = (
            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
        )
        extraction_dir = os.path.normpath(
            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
        )
        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"

        # Set once extraction is about to write to disk, so the cleanup in
        # `finally` never removes a directory this call did not populate.
        extracted = False
        try:
            # Extract the zip file safely
            with zipfile.ZipFile(local_path, "r") as zipObj:
                # Safeguard against path traversal. os.path.commonpath compares
                # whole path components; os.path.commonprefix compares
                # character by character and would accept a sibling directory
                # such as "<extraction_dir>_evil".
                base_dir = os.path.abspath(extraction_dir)
                for member in zipObj.namelist():
                    member_path = os.path.abspath(
                        os.path.join(extraction_dir, member)
                    )
                    if os.path.commonpath([base_dir, member_path]) != base_dir:
                        raise ValueError(
                            f"Path traversal detected in zip file: {member}"
                        )

                # Extract all files safely
                extracted = True
                zipObj.extractall(path=extraction_dir)

            # Process each extracted file
            for root, dirs, files in os.walk(extraction_dir):
                for name in files:
                    file_path = os.path.join(root, name)
                    relative_path = os.path.relpath(file_path, extraction_dir)

                    # Get file extension
                    _, file_extension = os.path.splitext(name)

                    # Update kwargs for the file
                    file_kwargs = kwargs.copy()
                    file_kwargs["file_extension"] = file_extension
                    file_kwargs["_parent_converters"] = parent_converters

                    # Try converting the file using available converters
                    for converter in parent_converters:
                        # Skip the zip converter to avoid infinite recursion
                        if isinstance(converter, ZipConverter):
                            continue

                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
                            md_content += result.text_content + "\n\n"
                            break

            return DocumentConverterResult(title=None, text_content=md_content.strip())

        except zipfile.BadZipFile:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except ValueError as ve:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
            )
        except Exception as e:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
        finally:
            # Clean up extracted files if specified. Done here so a failed
            # conversion does not leave the extracted tree behind.
            if extracted and kwargs.get("cleanup_extracted", True):
                shutil.rmtree(extraction_dir, ignore_errors=True)
class FileConversionException(BaseException): class FileConversionException(BaseException):
pass pass
@ -866,16 +1234,50 @@ class MarkItDown:
def __init__( def __init__(
self, self,
requests_session: Optional[requests.Session] = None, requests_session: Optional[requests.Session] = None,
llm_client: Optional[Any] = None,
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
# Deprecated
mlm_client: Optional[Any] = None, mlm_client: Optional[Any] = None,
mlm_model: Optional[Any] = None, mlm_model: Optional[str] = None,
): ):
if requests_session is None: if requests_session is None:
self._requests_session = requests.Session() self._requests_session = requests.Session()
else: else:
self._requests_session = requests_session self._requests_session = requests_session
self._mlm_client = mlm_client # Handle deprecation notices
self._mlm_model = mlm_model #############################
if mlm_client is not None:
if llm_client is None:
warn(
"'mlm_client' is deprecated, and was renamed 'llm_client'.",
DeprecationWarning,
)
llm_client = mlm_client
mlm_client = None
else:
raise ValueError(
"'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
)
if mlm_model is not None:
if llm_model is None:
warn(
"'mlm_model' is deprecated, and was renamed 'llm_model'.",
DeprecationWarning,
)
llm_model = mlm_model
mlm_model = None
else:
raise ValueError(
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
)
#############################
self._llm_client = llm_client
self._llm_model = llm_model
self._style_map = style_map
self._page_converters: List[DocumentConverter] = [] self._page_converters: List[DocumentConverter] = []
@ -884,6 +1286,7 @@ class MarkItDown:
# To this end, the most specific converters should appear below the most generic converters # To this end, the most specific converters should appear below the most generic converters
self.register_page_converter(PlainTextConverter()) self.register_page_converter(PlainTextConverter())
self.register_page_converter(HtmlConverter()) self.register_page_converter(HtmlConverter())
self.register_page_converter(RSSConverter())
self.register_page_converter(WikipediaConverter()) self.register_page_converter(WikipediaConverter())
self.register_page_converter(YouTubeConverter()) self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter()) self.register_page_converter(BingSerpConverter())
@ -893,14 +1296,16 @@ class MarkItDown:
self.register_page_converter(WavConverter()) self.register_page_converter(WavConverter())
self.register_page_converter(Mp3Converter()) self.register_page_converter(Mp3Converter())
self.register_page_converter(ImageConverter()) self.register_page_converter(ImageConverter())
self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter()) self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter())
def convert( def convert(
self, source: Union[str, requests.Response], **kwargs: Any self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
""" """
Args: Args:
- source: can be a string representing a path or url, or a requests.response object - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
""" """
@ -917,10 +1322,14 @@ class MarkItDown:
# Request response # Request response
elif isinstance(source, requests.Response): elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs) return self.convert_response(source, **kwargs)
elif isinstance(source, Path):
return self.convert_local(source, **kwargs)
def convert_local( def convert_local(
self, path: str, **kwargs: Any self, path: Union[str, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
if isinstance(path, Path):
path = str(path)
# Prepare a list of extensions to try (in order of priority) # Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension") ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else [] extensions = [ext] if ext is not None else []
@ -1017,7 +1426,7 @@ class MarkItDown:
self._append_ext(extensions, g) self._append_ext(extensions, g)
# Convert # Convert
result = self._convert(temp_path, extensions, url=response.url) result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up # Clean up
finally: finally:
try: try:
@ -1044,11 +1453,17 @@ class MarkItDown:
_kwargs.update({"file_extension": ext}) _kwargs.update({"file_extension": ext})
# Copy any additional global options # Copy any additional global options
if "mlm_client" not in _kwargs and self._mlm_client is not None: if "llm_client" not in _kwargs and self._llm_client is not None:
_kwargs["mlm_client"] = self._mlm_client _kwargs["llm_client"] = self._llm_client
if "mlm_model" not in _kwargs and self._mlm_model is not None: if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["mlm_model"] = self._mlm_model _kwargs["llm_model"] = self._llm_model
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
try: try:
@ -1085,7 +1500,6 @@ class MarkItDown:
if ext == "": if ext == "":
return return
# if ext not in extensions: # if ext not in extensions:
if True:
extensions.append(ext) extensions.append(ext)
def _guess_ext_magic(self, path): def _guess_ext_magic(self, path):

0
src/markitdown/py.typed Normal file
View file

0
tests/test_files/test.docx vendored Executable file → Normal file
View file

0
tests/test_files/test.jpg vendored Executable file → Normal file
View file

Before

Width:  |  Height:  |  Size: 463 KiB

After

Width:  |  Height:  |  Size: 463 KiB

BIN
tests/test_files/test.pptx vendored Executable file → Normal file

Binary file not shown.

0
tests/test_files/test.xlsx vendored Executable file → Normal file
View file

BIN
tests/test_files/test_files.zip vendored Normal file

Binary file not shown.

BIN
tests/test_files/test_llm.jpg vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 145 KiB

4
tests/test_files/test_mskanji.csv vendored Normal file
View file

@ -0,0 +1,4 @@
¼O,”N—î,<2C>Z<EFBFBD>Š
<EFBFBD>²“¡¾˜Y,30,“Œ‹ž
ŽO؉pŽq,25,å<E28098>ã
îà´<EFBFBD>~,35,–¼ŒÃ‰®
1 –¼‘O ”N—î �Z�Š
2 �²“¡‘¾˜Y 30 “Œ‹ž
3 ŽO–؉pŽq 25 ‘å�ã
4 îà‹´�~ 35 –¼ŒÃ‰®

89
tests/test_files/test_notebook.ipynb vendored Normal file
View file

@ -0,0 +1,89 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0f61db80",
"metadata": {},
"source": [
"# Test Notebook"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3f2a5bbd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"markitdown\n"
]
}
],
"source": [
"print('markitdown')"
]
},
{
"cell_type": "markdown",
"id": "9b9c0468",
"metadata": {},
"source": [
"## Code Cell Below"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "37d8088a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"42\n"
]
}
],
"source": [
"# comment in code\n",
"print(42)"
]
},
{
"cell_type": "markdown",
"id": "2e3177bd",
"metadata": {},
"source": [
"End\n",
"\n",
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
},
"title": "Test Notebook Title"
},
"nbformat": 4,
"nbformat_minor": 5
}

1
tests/test_files/test_rss.xml vendored Normal file

File diff suppressed because one or more lines are too long

BIN
tests/test_files/test_with_comment.docx vendored Normal file

Binary file not shown.

View file

@ -6,11 +6,23 @@ import shutil
import pytest import pytest
import requests import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown from markitdown import MarkItDown
skip_remote = ( skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False True if os.environ.get("GITHUB_ACTIONS") else False
) # Don't run these tests in CI ) # Don't run these tests in CI
# The LLM tests need both an API key and the openai client library;
# skip them when either one is missing.
skip_llm = not os.environ.get("OPENAI_API_KEY")
try:
    import openai
except ModuleNotFoundError:
    skip_llm = True
# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None skip_exiftool = shutil.which("exiftool") is None
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@ -51,12 +63,25 @@ DOCX_TEST_STRINGS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
] ]
DOCX_COMMENT_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"This is a test comment. 12df-321a",
"Yet another comment in the doc. 55yiyi-asd09",
]
PPTX_TEST_STRINGS = [ PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455", "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12", "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592", "1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
] ]
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@ -65,6 +90,13 @@ BLOG_TEST_STRINGS = [
"an example where high cost can easily prevent a generic complex", "an example where high cost can easily prevent a generic complex",
] ]
RSS_TEST_STRINGS = [
"The Official Microsoft Blog",
"In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]
WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft" WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [ WIKIPEDIA_TEST_STRINGS = [
"Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
@ -87,6 +119,28 @@ SERP_TEST_EXCLUDES = [
"data:image/svg+xml,%3Csvg%20width%3D", "data:image/svg+xml,%3Csvg%20width%3D",
] ]
CSV_CP932_TEST_STRINGS = [
"名前,年齢,住所",
"佐藤太郎,30,東京",
"三木英子,25,大阪",
"髙橋淳,35,名古屋",
]
LLM_TEST_STRINGS = [
"5bda1dd6",
]
# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
    """Assert that expected strings occur in a result's text and excluded ones do not.

    Backslashes are removed from ``result.text_content`` before matching
    (presumably to neutralize Markdown escaping -- confirm with the converters).

    Args:
        result: Any object exposing a ``text_content`` string attribute.
        expected_strings: Iterable of substrings that must all be present.
        exclude_strings: Optional iterable of substrings that must all be
            absent; ``None`` or an empty collection disables the check.

    Raises:
        AssertionError: On the first expected string that is missing or the
            first excluded string that is present. The message names the
            offending string, so failures stay diagnosable even when the file
            is run directly instead of through pytest.
    """
    text_content = result.text_content.replace("\\", "")
    for string in expected_strings:
        assert string in text_content, f"expected string not found: {string!r}"
    for string in exclude_strings or ():
        assert string not in text_content, f"excluded string found: {string!r}"
@pytest.mark.skipif( @pytest.mark.skipif(
skip_remote, skip_remote,
@ -120,50 +174,64 @@ def test_markitdown_local() -> None:
# Test XLSX processing # Test XLSX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
for test_string in XLSX_TEST_STRINGS: validate_strings(result, XLSX_TEST_STRINGS)
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test DOCX processing # Test DOCX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
for test_string in DOCX_TEST_STRINGS: validate_strings(result, DOCX_TEST_STRINGS)
text_content = result.text_content.replace("\\", "")
assert test_string in text_content # Test DOCX processing, with comments
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
style_map="comment-reference => ",
)
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
)
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
# Test PPTX processing # Test PPTX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
for test_string in PPTX_TEST_STRINGS: validate_strings(result, PPTX_TEST_STRINGS)
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test HTML processing # Test HTML processing
result = markitdown.convert( result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
) )
for test_string in BLOG_TEST_STRINGS: validate_strings(result, BLOG_TEST_STRINGS)
text_content = result.text_content.replace("\\", "")
assert test_string in text_content # Test ZIP file processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
validate_strings(result, XLSX_TEST_STRINGS)
# Test Wikipedia processing # Test Wikipedia processing
result = markitdown.convert( result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
) )
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
for test_string in WIKIPEDIA_TEST_EXCLUDES: validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
assert test_string not in text_content
for test_string in WIKIPEDIA_TEST_STRINGS:
assert test_string in text_content
# Test Bing processing # Test Bing processing
result = markitdown.convert( result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
) )
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
for test_string in SERP_TEST_EXCLUDES: validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
assert test_string not in text_content
for test_string in SERP_TEST_STRINGS: # Test RSS processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
text_content = result.text_content.replace("\\", "")
for test_string in RSS_TEST_STRINGS:
assert test_string in text_content assert test_string in text_content
## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
validate_strings(result, CSV_CP932_TEST_STRINGS)
@pytest.mark.skipif( @pytest.mark.skipif(
skip_exiftool, skip_exiftool,
@ -179,8 +247,63 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content assert target in result.text_content
def test_markitdown_deprecation() -> None:
    """Check the deprecated ``mlm_client`` / ``mlm_model`` constructor arguments.

    A deprecated argument used on its own must emit exactly one
    DeprecationWarning and its value must end up in the matching ``_llm_*``
    attribute. Passing a deprecated argument together with its replacement
    must raise ValueError.
    """
    # Local import: the module header only pulls in catch_warnings and
    # resetwarnings, and simplefilter is needed here as well.
    import warnings

    test_client = object()

    # simplefilter("always") makes recording independent of the ambient warning
    # filters (DeprecationWarning raised from library code is ignored by
    # default). catch_warnings restores the previous filters on exit, so no
    # global resetwarnings() call is needed -- that call would also discard
    # filters installed by pytest or -W for every test that runs afterwards.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        markitdown = MarkItDown(mlm_client=test_client)
        deprecations = [x for x in w if issubclass(x.category, DeprecationWarning)]
        assert len(deprecations) == 1
        assert deprecations[0].category is DeprecationWarning
        assert markitdown._llm_client is test_client

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        markitdown = MarkItDown(mlm_model="gpt-4o")
        deprecations = [x for x in w if issubclass(x.category, DeprecationWarning)]
        assert len(deprecations) == 1
        assert deprecations[0].category is DeprecationWarning
        assert markitdown._llm_model == "gpt-4o"

    # Supplying both the deprecated and the new spelling is an error.
    with pytest.raises(ValueError):
        MarkItDown(mlm_client=test_client, llm_client=test_client)

    with pytest.raises(ValueError):
        MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
    """Convert test_llm.jpg with an OpenAI-backed MarkItDown and check the text.

    The output must contain every entry of LLM_TEST_STRINGS and, compared
    case-insensitively, the words "red", "circle", "blue" and "square".
    """
    llm = openai.OpenAI()
    converter = MarkItDown(llm_client=llm, llm_model="gpt-4o")

    image_path = os.path.join(TEST_FILES_DIR, "test_llm.jpg")
    result = converter.convert(image_path)

    for expected in LLM_TEST_STRINGS:
        assert expected in result.text_content

    # Deliberately loose: this would also pass for "red square", "blue circle",
    # "the square is not blue", etc., which is good enough for this test.
    for keyword in ("red", "circle", "blue", "square"):
        assert keyword in result.text_content.lower()
if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    # Calling the test functions directly bypasses pytest's skipif markers, so
    # the module-level skip flags are applied by hand. Without the skip_llm
    # guard, test_markitdown_llm() fails with NameError when the openai
    # package is not installed, or with a client error when no API key is set.
    if not skip_remote:
        test_markitdown_remote()
    test_markitdown_local()
    if not skip_exiftool:
        test_markitdown_exiftool()
    test_markitdown_deprecation()
    if not skip_llm:
        test_markitdown_llm()