Merge branch 'main' into add-async-wrapper

Author: Raduan77
Date: 2024-12-18 10:42:53 +01:00
Commit: 60d2840656
20 changed files with 799 additions and 59 deletions

.devcontainer/devcontainer.json Normal file

@@ -0,0 +1,29 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{
    "name": "Existing Dockerfile",
    "build": {
        // Sets the run context to one level up instead of the .devcontainer folder.
        "context": "..",
        // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
        "dockerfile": "../Dockerfile"
    },
    // Features to add to the dev container. More info: https://containers.dev/features.
    // "features": {},
    "features": {
        "ghcr.io/devcontainers-extra/features/hatch:2": {}
    },
    // Use 'forwardPorts' to make a list of ports inside the container available locally.
    // "forwardPorts": [],
    // Uncomment the next line to run commands after the container is created.
    // "postCreateCommand": "cat /etc/os-release",
    // Configure tool-specific properties.
    // "customizations": {},
    // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
    "remoteUser": "root"
}

.dockerignore Normal file (1 line)

@@ -0,0 +1 @@
*

.gitattributes vendored Normal file (1 line)

@@ -0,0 +1 @@
tests/test_files/** linguist-vendored

Dockerfile Normal file (18 lines)

@@ -0,0 +1,18 @@
FROM python:3.13-slim-bullseye

USER root

# Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

RUN pip install markitdown

# Default USERID and GROUPID
ARG USERID=10000
ARG GROUPID=10000

USER $USERID:$GROUPID

ENTRYPOINT [ "markitdown" ]

README.md

@@ -1,28 +1,66 @@
# MarkItDown

[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)

MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc.).

It supports:

- PDF
- PowerPoint
- Word
- Excel
- Images (EXIF metadata and OCR)
- Audio (EXIF metadata and speech transcription)
- HTML
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)

To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`

## Usage

### Command-Line

```bash
markitdown path-to-file.pdf > document.md
```

You can also pipe content:

```bash
cat path-to-file.pdf | markitdown
```

### Python API

Basic usage in Python:

```python
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("test.xlsx")
print(result.text_content)
```

To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:

```python
from markitdown import MarkItDown
from openai import OpenAI

client = OpenAI()
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg")
print(result.text_content)
```

### Docker

```sh
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```

## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a

@@ -37,10 +75,27 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope

For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

### Running Tests and Checks

- Install `hatch` in your environment and run tests:

```sh
pip install hatch  # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
hatch shell
hatch test
```

  (Alternative) Use the Devcontainer, which has all the dependencies installed:

```sh
# Reopen the project in Devcontainer and run:
hatch test
```

- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`

## Trademarks

This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos is subject to those third parties' policies.

pyproject.toml

@@ -38,6 +38,8 @@ dependencies = [
    "youtube-transcript-api",
    "SpeechRecognition",
    "pathvalidate",
    "charset-normalizer",
    "openai",
]

[tool.uv]

@@ -82,3 +84,6 @@ exclude_lines = [
    "if __name__ == .__main__.:",
    "if TYPE_CHECKING:",
]

[tool.hatch.build.targets.sdist]
only-include = ["src/markitdown"]
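The new `[tool.hatch.build.targets.sdist]` table trims source distributions down to the package itself. A quick way to verify, assuming `hatch` is installed:

```sh
hatch build -t sdist
# The archive should list only src/markitdown files (plus standard sdist metadata).
tar -tzf dist/markitdown-*.tar.gz | head
```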

src/markitdown/__about__.py

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT

__version__ = "0.0.1a3"

src/markitdown/__main__.py

@@ -2,21 +2,15 @@
#
# SPDX-License-Identifier: MIT

import sys
import argparse
from ._markitdown import MarkItDown


def main():
    parser = argparse.ArgumentParser(
        description="Convert various file formats to markdown.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage="""
SYNTAX:

    markitdown <OPTIONAL: FILENAME>

@@ -33,9 +27,20 @@ EXAMPLE:

    OR

    markitdown < example.pdf
""".strip(),
    )

    parser.add_argument("filename", nargs="?")
    args = parser.parse_args()

    if args.filename is None:
        markitdown = MarkItDown()
        result = markitdown.convert_stream(sys.stdin.buffer)
        print(result.text_content)
    else:
        markitdown = MarkItDown()
        result = markitdown.convert(args.filename)
        print(result.text_content)


if __name__ == "__main__":

src/markitdown/_markitdown.py

@@ -12,8 +12,11 @@ import subprocess
import sys
import tempfile
import traceback
import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings

import mammoth
import markdownify

@@ -26,15 +29,24 @@ import pptx
import puremagic
import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_path

# Optional Transcription support
try:
    # Using warnings' catch_warnings to catch
    # pydub's warning of ffmpeg or avconv missing
    with catch_warnings(record=True) as w:
        import pydub

        if w:
            raise ModuleNotFoundError
    import speech_recognition as sr

    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
    pass
finally:
    resetwarnings()

# Optional YouTube transcription support
try:

@@ -161,9 +173,7 @@ class PlainTextConverter(DocumentConverter):
        elif "text/" not in content_type.lower():
            return None

        text_content = str(from_path(local_path).best())

        return DocumentConverterResult(
            title=None,
            text_content=text_content,
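The switch from a hard-coded UTF-8 `open()` to charset detection is what the new CP932 CSV test exercises. A minimal sketch of the `charset_normalizer` call (file name hypothetical; note that `best()` can return `None` for undecodable input):

```python
from charset_normalizer import from_path

# from_path() scans the file and returns ranked candidate decodings;
# best() picks the most plausible match, and str() yields the decoded text.
match = from_path("test_mskanji.csv").best()
if match is not None:
    print(match.encoding)  # e.g. "cp932" for Shift-JIS-family Japanese text
    print(str(match))      # decoded contents
```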
@@ -213,6 +223,143 @@ class HtmlConverter(DocumentConverter):
        )
class RSSConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""

    def convert(
        self, local_path: str, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not RSS type
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".xml", ".rss", ".atom"]:
            return None
        try:
            doc = minidom.parse(local_path)
        except BaseException as _:
            return None
        result = None
        if doc.getElementsByTagName("rss"):
            # A RSS feed must have a root element of <rss>
            result = self._parse_rss_type(doc)
        elif doc.getElementsByTagName("feed"):
            root = doc.getElementsByTagName("feed")[0]
            if root.getElementsByTagName("entry"):
                # An Atom feed must have a root element of <feed> and at least one <entry>
                result = self._parse_atom_type(doc)
            else:
                return None
        else:
            # not rss or atom
            return None
        return result

    def _parse_atom_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
        """Parse the type of an Atom feed.

        Returns None if the feed type is not recognized or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("feed")[0]
            title = self._get_data_by_tag_name(root, "title")
            subtitle = self._get_data_by_tag_name(root, "subtitle")
            entries = root.getElementsByTagName("entry")
            md_text = f"# {title}\n"
            if subtitle:
                md_text += f"{subtitle}\n"
            for entry in entries:
                entry_title = self._get_data_by_tag_name(entry, "title")
                entry_summary = self._get_data_by_tag_name(entry, "summary")
                entry_updated = self._get_data_by_tag_name(entry, "updated")
                entry_content = self._get_data_by_tag_name(entry, "content")

                if entry_title:
                    md_text += f"\n## {entry_title}\n"
                if entry_updated:
                    md_text += f"Updated on: {entry_updated}\n"
                if entry_summary:
                    md_text += self._parse_content(entry_summary)
                if entry_content:
                    md_text += self._parse_content(entry_content)

            return DocumentConverterResult(
                title=title,
                text_content=md_text,
            )
        except BaseException as _:
            return None

    def _parse_rss_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
        """Parse the type of an RSS feed.

        Returns None if the feed type is not recognized or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("rss")[0]
            channel = root.getElementsByTagName("channel")
            if not channel:
                return None
            channel = channel[0]
            channel_title = self._get_data_by_tag_name(channel, "title")
            channel_description = self._get_data_by_tag_name(channel, "description")
            items = channel.getElementsByTagName("item")
            if channel_title:
                md_text = f"# {channel_title}\n"
            if channel_description:
                md_text += f"{channel_description}\n"
            if not items:
                items = []
            for item in items:
                title = self._get_data_by_tag_name(item, "title")
                description = self._get_data_by_tag_name(item, "description")
                pubDate = self._get_data_by_tag_name(item, "pubDate")
                content = self._get_data_by_tag_name(item, "content:encoded")

                if title:
                    md_text += f"\n## {title}\n"
                if pubDate:
                    md_text += f"Published on: {pubDate}\n"
                if description:
                    md_text += self._parse_content(description)
                if content:
                    md_text += self._parse_content(content)

            return DocumentConverterResult(
                title=channel_title,
                text_content=md_text,
            )
        except BaseException as _:
            print(traceback.format_exc())
            return None

    def _parse_content(self, content: str) -> str:
        """Parse the content of an RSS feed item"""
        try:
            # using bs4 because many RSS feeds have HTML-styled content
            soup = BeautifulSoup(content, "html.parser")
            return _CustomMarkdownify().convert_soup(soup)
        except BaseException as _:
            return content

    def _get_data_by_tag_name(
        self, element: minidom.Element, tag_name: str
    ) -> Union[str, None]:
        """Get data from first child element with the given tag name.

        Returns None when no such element is found.
        """
        nodes = element.getElementsByTagName(tag_name)
        if not nodes:
            return None
        fc = nodes[0].firstChild
        if fc:
            return fc.data
        return None
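Since the converter is registered on `MarkItDown` (see the `register_page_converter` hunk further down), feeds convert like any other file. A sketch, with a hypothetical feed path:

```python
from markitdown import MarkItDown

md = MarkItDown()
# .xml, .rss, and .atom extensions are accepted; the converter sniffs for an
# <rss> or <feed> root and bails (returns None) for anything else.
result = md.convert("feed.xml")
print(result.text_content)  # "# <channel title>", then one "## <item title>" per item
```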
class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

@@ -344,8 +491,11 @@ class YouTubeConverter(DocumentConverter):
            assert isinstance(params["v"][0], str)
            video_id = str(params["v"][0])

            try:
                youtube_transcript_languages = kwargs.get(
                    "youtube_transcript_languages", ("en",)
                )
                # Must be a single transcript.
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
                transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                # Alternative formatting:
                # formatter = TextFormatter()

@@ -391,6 +541,67 @@
            return None
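Because `convert()` forwards `**kwargs` to each converter, the new transcript-language preference can be set per call. A sketch (video URL is a placeholder):

```python
from markitdown import MarkItDown

md = MarkItDown()
# Ask youtube-transcript-api for a German transcript first, falling back to English.
result = md.convert(
    "https://www.youtube.com/watch?v=VIDEO_ID",
    youtube_transcript_languages=("de", "en"),
)
print(result.text_content)
```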
class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not ipynb
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".ipynb":
            return None

        # Parse and convert the notebook
        result = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            notebook_content = json.load(fh)
            result = self._convert(notebook_content)

        return result

    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
        """Helper function that converts notebook JSON content to Markdown."""
        try:
            md_output = []
            title = None

            for cell in notebook_content.get("cells", []):
                cell_type = cell.get("cell_type", "")
                source_lines = cell.get("source", [])

                if cell_type == "markdown":
                    md_output.append("".join(source_lines))

                    # Extract the first # heading as title if not already found
                    if title is None:
                        for line in source_lines:
                            if line.startswith("# "):
                                title = line.lstrip("# ").strip()
                                break

                elif cell_type == "code":
                    # Code cells are wrapped in Markdown code blocks
                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
                elif cell_type == "raw":
                    md_output.append(f"```\n{''.join(source_lines)}\n```")

            md_text = "\n\n".join(md_output)

            # Check for title in notebook metadata
            title = notebook_content.get("metadata", {}).get("title", title)

            return DocumentConverterResult(
                title=title,
                text_content=md_text,
            )
        except Exception as e:
            raise FileConversionException(
                f"Error converting .ipynb file: {str(e)}"
            ) from e
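In short: markdown cells pass through verbatim, code cells become fenced `python` blocks, raw cells become plain fences, and the title falls back from notebook metadata to the first `# ` heading. A sketch (local path hypothetical):

```python
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("test_notebook.ipynb")
print(result.title)         # metadata "title" wins over the first heading
print(result.text_content)  # markdown cells verbatim, code cells fenced
```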
class BingSerpConverter(DocumentConverter):
    """
    Handle Bing results pages (only the organic search results).

@@ -492,7 +703,9 @@ class DocxConverter(HtmlConverter):
        result = None
        with open(local_path, "rb") as docx_file:
            style_map = kwargs.get("style_map", None)
            result = mammoth.convert_to_html(docx_file, style_map=style_map)
            html_content = result.value
            result = self._convert(html_content)
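`style_map` is handed straight to `mammoth.convert_to_html`, so mammoth's style-map syntax applies; the new tests use it to keep Word comments in the output. A sketch (file name hypothetical):

```python
from markitdown import MarkItDown

# The mapping used by this commit's tests to surface Word comments.
# It can be set once on the instance, or per call via convert(..., style_map=...).
md = MarkItDown(style_map="comment-reference => ")
result = md.convert("report_with_comments.docx")
print(result.text_content)
```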
@@ -582,6 +795,10 @@ class PptxConverter(HtmlConverter):
                        "\n" + self._convert(html_table).text_content.strip() + "\n"
                    )

                # Charts
                if shape.has_chart:
                    md_content += self._convert_chart_to_markdown(shape.chart)

                # Text areas
                elif shape.has_text_frame:
                    if shape == title:

@@ -616,6 +833,29 @@
            return True
        return False
    def _convert_chart_to_markdown(self, chart):
        md = "\n\n### Chart"
        if chart.has_title:
            md += f": {chart.chart_title.text_frame.text}"
        md += "\n\n"
        data = []
        category_names = [c.label for c in chart.plots[0].categories]
        series_names = [s.name for s in chart.series]
        data.append(["Category"] + series_names)
        for idx, category in enumerate(category_names):
            row = [category]
            for series in chart.series:
                row.append(series.values[idx])
            data.append(row)

        markdown_table = []
        for row in data:
            markdown_table.append("| " + " | ".join(map(str, row)) + " |")
        header = markdown_table[0]
        separator = "|" + "|".join(["---"] * len(data[0])) + "|"
        return md + "\n".join([header, separator] + markdown_table[1:])
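For a chart with one value series over two category years, the helper would emit something like the following (title and values illustrative; categories come from the first plot, one column per series):

```markdown
### Chart: Quarterly Results

| Category | Series 1 |
|---|---|
| 2003 | 10.0 |
| 2004 | 12.5 |
```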
class MediaConverter(DocumentConverter):
    """

@@ -642,7 +882,7 @@ class WavConverter(MediaConverter):
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a WAV
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".wav":
            return None

@@ -754,11 +994,11 @@ class Mp3Converter(WavConverter):

class ImageConverter(MediaConverter):
    """
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an `llm_client` is configured).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an image
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None
@@ -784,17 +1024,17 @@ class ImageConverter(MediaConverter):
            md_content += f"{f}: {metadata[f]}\n"

        # Try describing the image with GPTV
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")
        if llm_client is not None and llm_model is not None:
            md_content += (
                "\n# Description:\n"
                + self._get_llm_description(
                    local_path,
                    extension,
                    llm_client,
                    llm_model,
                    prompt=kwargs.get("llm_prompt"),
                ).strip()
                + "\n"
            )

@@ -804,12 +1044,10 @@
            text_content=md_content,
        )

    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        data_uri = ""
        with open(local_path, "rb") as image_file:
            content_type, encoding = mimetypes.guess_type("_dummy" + extension)

@@ -837,6 +1075,124 @@
        return response.choices[0].message.content
class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.

    The converter extracts the ZIP contents to a temporary directory, processes each file
    using appropriate converters based on file extensions, and then combines the results
    into a single markdown document. The temporary directory is cleaned up after processing.

    Example output format:
    ```markdown
    Content from the zip file `example.zip`:

    ## File: docs/readme.txt

    This is the content of readme.txt
    Multiple lines are preserved

    ## File: images/example.jpg

    ImageSize: 1920x1080
    DateTimeOriginal: 2024-02-15 14:30:00
    Description: A beautiful landscape photo

    ## File: data/report.xlsx

    ## Sheet1
    | Column1 | Column2 | Column3 |
    |---------|---------|---------|
    | data1   | data2   | data3   |
    | data4   | data5   | data6   |
    ```

    Key features:
    - Maintains original file structure in headings
    - Processes nested files recursively
    - Uses appropriate converters for each file type
    - Preserves formatting of converted content
    - Cleans up temporary files after processing
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a ZIP
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".zip":
            return None

        # Get parent converters list if available
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )

        extracted_zip_folder_name = (
            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
        )
        new_folder = os.path.normpath(
            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
        )
        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"

        # Safety check for path traversal
        if not new_folder.startswith(os.path.dirname(local_path)):
            return DocumentConverterResult(
                title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
            )

        try:
            # Extract the zip file
            with zipfile.ZipFile(local_path, "r") as zipObj:
                zipObj.extractall(path=new_folder)

            # Process each extracted file
            for root, dirs, files in os.walk(new_folder):
                for name in files:
                    file_path = os.path.join(root, name)
                    relative_path = os.path.relpath(file_path, new_folder)

                    # Get file extension
                    _, file_extension = os.path.splitext(name)

                    # Update kwargs for the file
                    file_kwargs = kwargs.copy()
                    file_kwargs["file_extension"] = file_extension
                    file_kwargs["_parent_converters"] = parent_converters

                    # Try converting the file using available converters
                    for converter in parent_converters:
                        # Skip the zip converter to avoid infinite recursion
                        if isinstance(converter, ZipConverter):
                            continue

                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
                            md_content += result.text_content + "\n\n"
                            break

            # Clean up extracted files if specified
            if kwargs.get("cleanup_extracted", True):
                shutil.rmtree(new_folder)

            return DocumentConverterResult(title=None, text_content=md_content.strip())

        except zipfile.BadZipFile:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except Exception as e:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
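Two details worth noting: extraction happens in a sibling folder next to the source archive, and the `cleanup_extracted` kwarg (default `True`) controls whether that folder is removed afterwards. A sketch (archive name hypothetical):

```python
from markitdown import MarkItDown

md = MarkItDown()
# Keep the extracted folder around for inspection; kwargs passed to convert()
# flow through to the individual converters.
result = md.convert("bundle.zip", cleanup_extracted=False)
print(result.text_content)  # "Content from the zip file `bundle.zip`:" + per-file sections
```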
class FileConversionException(BaseException):
    pass

@@ -852,16 +1208,50 @@ class MarkItDown:
    def __init__(
        self,
        requests_session: Optional[requests.Session] = None,
        llm_client: Optional[Any] = None,
        llm_model: Optional[str] = None,
        style_map: Optional[str] = None,
        # Deprecated
        mlm_client: Optional[Any] = None,
        mlm_model: Optional[str] = None,
    ):
        if requests_session is None:
            self._requests_session = requests.Session()
        else:
            self._requests_session = requests_session

        # Handle deprecation notices
        #############################
        if mlm_client is not None:
            if llm_client is None:
                warn(
                    "'mlm_client' is deprecated, and was renamed 'llm_client'.",
                    DeprecationWarning,
                )
                llm_client = mlm_client
                mlm_client = None
            else:
                raise ValueError(
                    "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
                )
        if mlm_model is not None:
            if llm_model is None:
                warn(
                    "'mlm_model' is deprecated, and was renamed 'llm_model'.",
                    DeprecationWarning,
                )
                llm_model = mlm_model
                mlm_model = None
            else:
                raise ValueError(
                    "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
                )
        #############################

        self._llm_client = llm_client
        self._llm_model = llm_model
        self._style_map = style_map

        self._page_converters: List[DocumentConverter] = []
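The effect for callers, sketched with the same warnings machinery the new test uses:

```python
import warnings
from markitdown import MarkItDown

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    md = MarkItDown(mlm_model="gpt-4o")  # deprecated name still works, with a warning
assert any(w.category is DeprecationWarning for w in caught)

# Mixing old and new names is an error:
# MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")  -> ValueError
```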
@@ -870,6 +1260,7 @@ class MarkItDown:
        # To this end, the most specific converters should appear below the most generic converters
        self.register_page_converter(PlainTextConverter())
        self.register_page_converter(HtmlConverter())
        self.register_page_converter(RSSConverter())
        self.register_page_converter(WikipediaConverter())
        self.register_page_converter(YouTubeConverter())
        self.register_page_converter(BingSerpConverter())

@@ -879,7 +1270,9 @@
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
    def convert(
        self, source: Union[str, requests.Response], **kwargs: Any

@@ -1003,7 +1396,7 @@
                self._append_ext(extensions, g)

            # Convert
            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
        # Clean up
        finally:
            try:
@@ -1030,11 +1423,17 @@
            _kwargs.update({"file_extension": ext})

            # Copy any additional global options
            if "llm_client" not in _kwargs and self._llm_client is not None:
                _kwargs["llm_client"] = self._llm_client
            if "llm_model" not in _kwargs and self._llm_model is not None:
                _kwargs["llm_model"] = self._llm_model

            # Add the list of converters for nested processing
            _kwargs["_parent_converters"] = self._page_converters

            if "style_map" not in _kwargs and self._style_map is not None:
                _kwargs["style_map"] = self._style_map

            # If we hit an error log it and keep trying
            try:
@@ -1071,8 +1470,7 @@
        if ext == "":
            return

        # if ext not in extensions:
        extensions.append(ext)

    def _guess_ext_magic(self, path):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""

tests/test_files/test.docx vendored (Executable file → Normal file)

tests/test_files/test.jpg vendored (Executable file → Normal file; 463 KiB before and after)

tests/test_files/test.pptx vendored (BIN, Executable file → Normal file; binary file not shown)

tests/test_files/test.xlsx vendored (Executable file → Normal file)

tests/test_files/test_files.zip vendored Normal file (BIN; binary file not shown)

tests/test_files/test_llm.jpg vendored Normal file (BIN, 145 KiB; binary file not shown)

tests/test_files/test_mskanji.csv vendored Normal file (4 lines, CP932-encoded; the web viewer renders it as mojibake)

@@ -0,0 +1,4 @@
名前,年齢,住所
佐藤太郎,30,東京
三木英子,25,大阪
髙橋淳,35,名古屋

tests/test_files/test_notebook.ipynb vendored Normal file (89 lines)

@@ -0,0 +1,89 @@
{
    "cells": [
        {
            "cell_type": "markdown",
            "id": "0f61db80",
            "metadata": {},
            "source": [
                "# Test Notebook"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 11,
            "id": "3f2a5bbd",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "markitdown\n"
                    ]
                }
            ],
            "source": [
                "print('markitdown')"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "9b9c0468",
            "metadata": {},
            "source": [
                "## Code Cell Below"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "id": "37d8088a",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "42\n"
                    ]
                }
            ],
            "source": [
                "# comment in code\n",
                "print(42)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "2e3177bd",
            "metadata": {},
            "source": [
                "End\n",
                "\n",
                "---"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.12.8"
        },
        "title": "Test Notebook Title"
    },
    "nbformat": 4,
    "nbformat_minor": 5
}

tests/test_files/test_rss.xml vendored Normal file (1 line; diff suppressed because one or more lines are too long)

tests/test_files/test_with_comment.docx vendored Normal file (BIN; binary file not shown)

tests/test_markitdown.py

@@ -6,11 +6,23 @@ import shutil

import pytest
import requests
from warnings import catch_warnings, resetwarnings

from markitdown import MarkItDown

skip_remote = (
    True if os.environ.get("GITHUB_ACTIONS") else False
)  # Don't run these tests in CI

# Don't run the llm tests without a key and the client library
skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
try:
    import openai
except ModuleNotFoundError:
    skip_llm = True

# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -51,12 +63,25 @@ DOCX_TEST_STRINGS = [
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
]

DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
]

PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
    "2003",  # chart value
]

BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"

@@ -65,6 +90,13 @@ BLOG_TEST_STRINGS = [
    "an example where high cost can easily prevent a generic complex",
]

RSS_TEST_STRINGS = [
    "The Official Microsoft Blog",
    "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]

WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [
    "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",

@@ -87,6 +119,17 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
]

CSV_CP932_TEST_STRINGS = [
    "名前,年齢,住所",
    "佐藤太郎,30,東京",
    "三木英子,25,大阪",
    "髙橋淳,35,名古屋",
]

LLM_TEST_STRINGS = [
    "5bda1dd6",
]
@pytest.mark.skipif(
    skip_remote,

@@ -130,6 +173,24 @@ def test_markitdown_local() -> None:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

    # Test DOCX processing, with comments
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
        style_map="comment-reference => ",
    )
    for test_string in DOCX_COMMENT_TEST_STRINGS:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    result = markitdown_with_style_map.convert(
        os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
    )
    for test_string in DOCX_COMMENT_TEST_STRINGS:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

    # Test PPTX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
    for test_string in PPTX_TEST_STRINGS:

@@ -144,6 +205,12 @@
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

    # Test ZIP file processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
    for test_string in DOCX_TEST_STRINGS:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

    # Test Wikipedia processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
    )

@@ -164,6 +231,18 @@ def test_markitdown_local() -> None:
    for test_string in SERP_TEST_STRINGS:
        assert test_string in text_content

    # Test RSS processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
    text_content = result.text_content.replace("\\", "")
    for test_string in RSS_TEST_STRINGS:
        assert test_string in text_content

    ## Test non-UTF-8 encoding
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
    text_content = result.text_content.replace("\\", "")
    for test_string in CSV_CP932_TEST_STRINGS:
        assert test_string in text_content
@pytest.mark.skipif(
    skip_exiftool,

@@ -179,8 +258,63 @@ def test_markitdown_exiftool() -> None:
    assert target in result.text_content
def test_markitdown_deprecation() -> None:
    try:
        with catch_warnings(record=True) as w:
            test_client = object()
            markitdown = MarkItDown(mlm_client=test_client)
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert markitdown._llm_client == test_client
    finally:
        resetwarnings()

    try:
        with catch_warnings(record=True) as w:
            markitdown = MarkItDown(mlm_model="gpt-4o")
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert markitdown._llm_model == "gpt-4o"
    finally:
        resetwarnings()

    try:
        test_client = object()
        markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
        assert False
    except ValueError:
        pass

    try:
        markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
        assert False
    except ValueError:
        pass


@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
    client = openai.OpenAI()
    markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")

    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))

    for test_string in LLM_TEST_STRINGS:
        assert test_string in result.text_content

    # This is not super precise. It would also accept "red square", "blue circle",
    # "the square is not blue", etc. But it's sufficient for this test.
    for test_string in ["red", "circle", "blue", "square"]:
        assert test_string in result.text_content.lower()
if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_markitdown_remote()
    test_markitdown_local()
    test_markitdown_exiftool()
    test_markitdown_deprecation()
    test_markitdown_llm()