Merge branch 'main' into hly/chore/xlsx
This commit is contained in:
commit
b10b295fb4
21 changed files with 732 additions and 166 deletions
32
.devcontainer/devcontainer.json
Normal file
32
.devcontainer/devcontainer.json
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
|
||||
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
|
||||
{
|
||||
"name": "Existing Dockerfile",
|
||||
"build": {
|
||||
// Sets the run context to one level up instead of the .devcontainer folder.
|
||||
"context": "..",
|
||||
// Update the 'dockerfile' property if you aren't using the standard 'Dockerfile' filename.
|
||||
"dockerfile": "../Dockerfile",
|
||||
"args": {
|
||||
"INSTALL_GIT": "true"
|
||||
}
|
||||
},
|
||||
|
||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
||||
// "features": {},
|
||||
"features": {
|
||||
"ghcr.io/devcontainers-extra/features/hatch:2": {}
|
||||
},
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
// "forwardPorts": [],
|
||||
|
||||
// Uncomment the next line to run commands after the container is created.
|
||||
// "postCreateCommand": "cat /etc/os-release",
|
||||
|
||||
// Configure tool-specific properties.
|
||||
// "customizations": {},
|
||||
|
||||
// Connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
|
||||
"remoteUser": "root"
|
||||
}
|
||||
1
.dockerignore
Normal file
1
.dockerignore
Normal file
|
|
@ -0,0 +1 @@
|
|||
*
|
||||
6
.github/dependabot.yml
vendored
Normal file
6
.github/dependabot.yml
vendored
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
4
.github/workflows/pre-commit.yml
vendored
4
.github/workflows/pre-commit.yml
vendored
|
|
@ -5,9 +5,9 @@ jobs:
|
|||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.x"
|
||||
|
||||
|
|
|
|||
6
.github/workflows/tests.yml
vendored
6
.github/workflows/tests.yml
vendored
|
|
@ -5,8 +5,8 @@ jobs:
|
|||
tests:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/setup-python@v4
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: |
|
||||
3.10
|
||||
|
|
@ -14,7 +14,7 @@ jobs:
|
|||
3.12
|
||||
- name: Set up pip cache
|
||||
if: runner.os == 'Linux'
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
|
||||
|
|
|
|||
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -1,3 +1,5 @@
|
|||
.vscode
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
|
@ -160,3 +162,5 @@ cython_debug/
|
|||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
src/.DS_Store
|
||||
.DS_Store
|
||||
|
|
|
|||
23
Dockerfile
Normal file
23
Dockerfile
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
FROM python:3.13-slim-bullseye

# Package installation below needs root.
USER root

# Git is only installed when the image is built with INSTALL_GIT=true
# (the devcontainer configuration passes this build argument).
ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
    apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
    fi

# Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir markitdown

# Default USERID and GROUPID
ARG USERID=10000
ARG GROUPID=10000

# Drop root privileges for the running container.
USER $USERID:$GROUPID

ENTRYPOINT [ "markitdown" ]
|
||||
164
README.md
164
README.md
|
|
@ -1,83 +1,113 @@
|
|||
> [!IMPORTANT]
|
||||
> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year!
|
||||
|
||||
# MarkItDown
|
||||
|
||||
[](https://pypi.org/project/markitdown/)
|
||||

|
||||
[](https://github.com/microsoft/autogen)
|
||||
|
||||
The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
|
||||
|
||||
It presently supports:
|
||||
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
||||
It supports:
|
||||
- PDF
|
||||
- PowerPoint
|
||||
- Word
|
||||
- Excel
|
||||
- Images (EXIF metadata and OCR)
|
||||
- Audio (EXIF metadata and speech transcription)
|
||||
- HTML
|
||||
- Text-based formats (CSV, JSON, XML)
|
||||
- ZIP files (iterates over contents)
|
||||
|
||||
- PDF (.pdf)
|
||||
- PowerPoint (.pptx)
|
||||
- Word (.docx)
|
||||
- Excel (.xlsx)
|
||||
- Images (EXIF metadata, and OCR)
|
||||
- Audio (EXIF metadata, and speech transcription)
|
||||
- HTML (special handling of Wikipedia, etc.)
|
||||
- Various other text-based formats (csv, json, xml, etc.)
|
||||
- ZIP (Iterates over contents and converts each file)
|
||||
To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
|
||||
|
||||
# Installation
|
||||
## Usage
|
||||
|
||||
You can install `markitdown` using pip:
|
||||
|
||||
```bash
|
||||
pip install markitdown
|
||||
```
|
||||
|
||||
or from the source
|
||||
|
||||
```sh
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
# Usage
|
||||
The API is simple:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert("test.xlsx")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
To use this as a command-line utility, install it and then run it like this:
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf
|
||||
```
|
||||
|
||||
This will output Markdown to standard output. You can save it like this:
|
||||
### Command-Line
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf > document.md
|
||||
```
|
||||
|
||||
You can pipe content to standard input by omitting the argument:
|
||||
Or use `-o` to specify the output file:
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf -o document.md
|
||||
```
|
||||
|
||||
You can also pipe content:
|
||||
|
||||
```bash
|
||||
cat path-to-file.pdf | markitdown
|
||||
```
|
||||
|
||||
### Python API
|
||||
|
||||
You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `mlm_client` and `mlm_model` parameters to MarkItDown object, according to your specific client.
|
||||
Basic usage in Python:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown()
|
||||
result = md.convert("test.xlsx")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI()
|
||||
md = MarkItDown(mlm_client=client, mlm_model="gpt-4o")
|
||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
|
||||
result = md.convert("example.jpg")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
The prompt of describing images can be customized by providing `mlm_prompt` parameter.
|
||||
### Docker
|
||||
|
||||
```python
|
||||
# ...
|
||||
result = md.convert("example.jpg", mlm_prompt="Customized prompt")
|
||||
```sh
|
||||
docker build -t markitdown:latest .
|
||||
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
|
||||
```
|
||||
<details>
|
||||
|
||||
<summary>Batch Processing Multiple Files</summary>
|
||||
|
||||
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
|
||||
|
||||
|
||||
```python convert.py
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
import os
|
||||
client = OpenAI(api_key="your-api-key-here")
|
||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
|
||||
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
|
||||
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
|
||||
for file in files_to_convert:
|
||||
print(f"\nConverting {file}...")
|
||||
try:
|
||||
md_file = os.path.splitext(file)[0] + '.md'
|
||||
result = md.convert(file)
|
||||
with open(md_file, 'w') as f:
|
||||
f.write(result.text_content)
|
||||
|
||||
print(f"Successfully converted {file} to {md_file}")
|
||||
except Exception as e:
|
||||
print(f"Error converting {file}: {str(e)}")
|
||||
|
||||
print("\nAll conversions completed!")
|
||||
```
|
||||
2. Place the script in the same directory as your files
|
||||
3. Install the required packages, such as `openai`
|
||||
4. Run the script: `python convert.py`
|
||||
|
||||
Note that original files will remain unchanged and new markdown files are created with the same base name.
|
||||
|
||||
</details>
|
||||
|
||||
## Contributing
|
||||
|
||||
|
|
@ -93,20 +123,36 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
|
|||
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
|
||||
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
||||
|
||||
### Running Tests
|
||||
### How to Contribute
|
||||
|
||||
To run the tests for this project, use the following command:
|
||||
You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are, of course, just suggestions and you are welcome to contribute in any way you like.
|
||||
|
||||
```sh
|
||||
hatch shell
|
||||
hatch test
|
||||
```
|
||||
|
||||
### Running Pre-commit Checks
|
||||
<div align="center">
|
||||
|
||||
```sh
|
||||
pre-commit run --all-files
|
||||
```
|
||||
| | All | Especially Needs Help from Community |
|
||||
|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
|
||||
| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
|
||||
| **PRs** | [All PRs](https://github.com/microsoft/markitdown/pulls) | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22) |
|
||||
|
||||
</div>
|
||||
|
||||
### Running Tests and Checks
|
||||
|
||||
- Install `hatch` in your environment and run tests:
|
||||
```sh
|
||||
pip install hatch # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
|
||||
hatch shell
|
||||
hatch test
|
||||
```
|
||||
|
||||
(Alternative) Use the Devcontainer which has all the dependencies installed:
|
||||
```sh
|
||||
# Reopen the project in Devcontainer and run:
|
||||
hatch test
|
||||
```
|
||||
|
||||
- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
|
||||
|
||||
## Trademarks
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ build-backend = "hatchling.build"
|
|||
[project]
|
||||
name = "markitdown"
|
||||
dynamic = ["version"]
|
||||
description = ''
|
||||
description = 'Utility tool for converting various files to Markdown'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
|
|
@ -39,6 +39,7 @@ dependencies = [
|
|||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
"charset-normalizer",
|
||||
"openai",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
|
@ -77,3 +78,6 @@ exclude_lines = [
|
|||
"if __name__ == .__main__.:",
|
||||
"if TYPE_CHECKING:",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
only-include = ["src/markitdown"]
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.0.1a1"
|
||||
__version__ = "0.0.1a3"
|
||||
|
|
|
|||
|
|
@ -1,41 +1,81 @@
|
|||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
import argparse
|
||||
import sys
|
||||
from ._markitdown import MarkItDown
|
||||
from textwrap import dedent
|
||||
from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown, DocumentConverterResult
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) == 1:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert various file formats to markdown.",
|
||||
prog="markitdown",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
usage=dedent(
|
||||
"""
|
||||
SYNTAX:
|
||||
|
||||
markitdown <OPTIONAL: FILENAME>
|
||||
If FILENAME is empty, markitdown reads from stdin.
|
||||
|
||||
EXAMPLE:
|
||||
|
||||
markitdown example.pdf
|
||||
|
||||
OR
|
||||
|
||||
cat example.pdf | markitdown
|
||||
|
||||
OR
|
||||
|
||||
markitdown < example.pdf
|
||||
|
||||
OR to save to a file use
|
||||
|
||||
markitdown example.pdf -o example.md
|
||||
|
||||
OR
|
||||
|
||||
markitdown example.pdf > example.md
|
||||
"""
|
||||
).strip(),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--version",
|
||||
action="version",
|
||||
version=f"%(prog)s {__version__}",
|
||||
help="show the version number and exit",
|
||||
)
|
||||
|
||||
parser.add_argument("filename", nargs="?")
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--output",
|
||||
help="Output file name. If not provided, output is written to stdout.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.filename is None:
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert_stream(sys.stdin.buffer)
|
||||
print(result.text_content)
|
||||
elif len(sys.argv) == 2:
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(sys.argv[1])
|
||||
print(result.text_content)
|
||||
_handle_output(args, result)
|
||||
else:
|
||||
sys.stderr.write(
|
||||
"""
|
||||
SYNTAX:
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(args.filename)
|
||||
_handle_output(args, result)
|
||||
|
||||
markitdown <OPTIONAL: FILENAME>
|
||||
If FILENAME is empty, markitdown reads from stdin.
|
||||
|
||||
EXAMPLE:
|
||||
|
||||
markitdown example.pdf
|
||||
|
||||
OR
|
||||
|
||||
cat example.pdf | markitdown
|
||||
|
||||
OR
|
||||
|
||||
markitdown < example.pdf
|
||||
""".strip()
|
||||
+ "\n"
|
||||
)
|
||||
def _handle_output(args, result: DocumentConverterResult):
|
||||
"""Handle output to stdout or file"""
|
||||
if args.output:
|
||||
with open(args.output, "w", encoding="utf-8") as f:
|
||||
f.write(result.text_content)
|
||||
else:
|
||||
print(result.text_content)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -13,9 +13,11 @@ import sys
|
|||
import tempfile
|
||||
import traceback
|
||||
import zipfile
|
||||
from xml.dom import minidom
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from warnings import catch_warnings
|
||||
from warnings import warn, resetwarnings, catch_warnings
|
||||
|
||||
import mammoth
|
||||
import markdownify
|
||||
|
|
@ -44,6 +46,8 @@ try:
|
|||
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
||||
except ModuleNotFoundError:
|
||||
pass
|
||||
finally:
|
||||
resetwarnings()
|
||||
|
||||
# Optional YouTube transcription support
|
||||
try:
|
||||
|
|
@ -226,6 +230,143 @@ class HtmlConverter(DocumentConverter):
|
|||
)
|
||||
|
||||
|
||||
class RSSConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown.

    Files are first filtered by extension (``.xml``, ``.rss``, ``.atom``) and
    then inspected: a document containing an ``<rss>`` element is rendered as
    RSS, one containing a ``<feed>`` with at least one ``<entry>`` as Atom.
    In every other case ``None`` is returned.
    """

    def convert(
        self, local_path: str, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        """Convert the feed at ``local_path``.

        Args:
            local_path: path of the XML file to parse.
            **kwargs: ``file_extension`` is read to decide whether to try.

        Returns:
            The converted feed, or None when the file is not a parsable
            RSS/Atom document.
        """
        # Bail if not RSS type
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".xml", ".rss", ".atom"]:
            return None

        try:
            doc = minidom.parse(local_path)
        except Exception:
            # Unreadable or malformed XML. Catching Exception (rather than
            # BaseException) lets KeyboardInterrupt / SystemExit propagate.
            return None

        if doc.getElementsByTagName("rss"):
            # A RSS feed must have a root element of <rss>
            return self._parse_rss_type(doc)

        feeds = doc.getElementsByTagName("feed")
        if feeds and feeds[0].getElementsByTagName("entry"):
            # An Atom feed must have a root element of <feed> and at least one <entry>
            return self._parse_atom_type(doc)

        # not rss or atom
        return None

    def _parse_atom_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
        """Render an Atom feed as Markdown.

        Returns None if the feed type is not recognized or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("feed")[0]
            title = self._get_data_by_tag_name(root, "title")
            subtitle = self._get_data_by_tag_name(root, "subtitle")

            parts = []
            if title:
                # Guarded so that a missing title does not render as "# None".
                parts.append(f"# {title}\n")
            if subtitle:
                parts.append(f"{subtitle}\n")

            for entry in root.getElementsByTagName("entry"):
                entry_title = self._get_data_by_tag_name(entry, "title")
                entry_summary = self._get_data_by_tag_name(entry, "summary")
                entry_updated = self._get_data_by_tag_name(entry, "updated")
                entry_content = self._get_data_by_tag_name(entry, "content")

                if entry_title:
                    parts.append(f"\n## {entry_title}\n")
                if entry_updated:
                    parts.append(f"Updated on: {entry_updated}\n")
                if entry_summary:
                    parts.append(self._parse_content(entry_summary))
                if entry_content:
                    parts.append(self._parse_content(entry_content))

            return DocumentConverterResult(
                title=title,
                text_content="".join(parts),
            )
        except Exception:
            return None

    def _parse_rss_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
        """Render an RSS feed as Markdown.

        Returns None if the feed type is not recognized or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("rss")[0]
            channels = root.getElementsByTagName("channel")
            if not channels:
                return None
            channel = channels[0]

            channel_title = self._get_data_by_tag_name(channel, "title")
            channel_description = self._get_data_by_tag_name(channel, "description")

            # Start from an empty buffer so a channel without a <title> no
            # longer fails on an unbound variable.
            parts = []
            if channel_title:
                parts.append(f"# {channel_title}\n")
            if channel_description:
                parts.append(f"{channel_description}\n")

            for item in channel.getElementsByTagName("item"):
                title = self._get_data_by_tag_name(item, "title")
                description = self._get_data_by_tag_name(item, "description")
                pub_date = self._get_data_by_tag_name(item, "pubDate")
                content = self._get_data_by_tag_name(item, "content:encoded")

                if title:
                    parts.append(f"\n## {title}\n")
                if pub_date:
                    parts.append(f"Published on: {pub_date}\n")
                if description:
                    parts.append(self._parse_content(description))
                if content:
                    parts.append(self._parse_content(content))

            return DocumentConverterResult(
                title=channel_title,
                text_content="".join(parts),
            )
        except Exception:
            # Report on stderr: stdout carries the converted Markdown when
            # the package is used from the command line.
            traceback.print_exc()
            return None

    def _parse_content(self, content: str) -> str:
        """Convert the (possibly HTML) content of a feed item to Markdown.

        Falls back to returning ``content`` unchanged if conversion fails.
        """
        try:
            # using bs4 because many RSS feeds have HTML-styled content
            soup = BeautifulSoup(content, "html.parser")
            return _CustomMarkdownify().convert_soup(soup)
        except Exception:
            return content

    def _get_data_by_tag_name(
        self, element: minidom.Element, tag_name: str
    ) -> Union[str, None]:
        """Get data from first child element with the given tag name.

        Returns None when no such element is found, when it has no children,
        or when its first child carries no character data (for example when
        the first child is itself an element).
        """
        nodes = element.getElementsByTagName(tag_name)
        if not nodes:
            return None
        first_child = nodes[0].firstChild
        # Element nodes have no ``data`` attribute; treat them as "no text"
        # instead of letting an AttributeError abort the whole feed.
        data = getattr(first_child, "data", None)
        return data or None
|
||||
|
||||
|
||||
class WikipediaConverter(DocumentConverter):
|
||||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||
|
||||
|
|
@ -407,6 +548,67 @@ class YouTubeConverter(DocumentConverter):
|
|||
return None
|
||||
|
||||
|
||||
class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the notebook at ``local_path``.

        Args:
            local_path: path of the notebook file, read as UTF-8 JSON.
            **kwargs: ``file_extension`` is read to decide whether to try.

        Returns:
            The converted notebook, or None when the extension is not
            ``.ipynb``.

        Raises:
            FileConversionException: if the notebook content cannot be
                converted (raised by ``_convert``).
        """
        # Bail if not ipynb
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".ipynb":
            return None

        # Parse and convert the notebook
        with open(local_path, "rt", encoding="utf-8") as fh:
            notebook_content = json.load(fh)

        return self._convert(notebook_content)

    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
        """Helper function that converts notebook JSON content to Markdown.

        Markdown cells are emitted as-is, code cells are wrapped in
        ```` ```python ```` fences and raw cells in plain fences; cells are
        separated by a blank line. The title is the ``title`` entry of the
        notebook metadata if present, otherwise the first ``# `` heading
        found in a markdown cell.

        Raises:
            FileConversionException: wrapping any error raised while
                walking the notebook structure.
        """
        try:
            md_output = []
            title = None

            for cell in notebook_content.get("cells", []):
                cell_type = cell.get("cell_type", "")
                source = cell.get("source", [])
                # A cell source may be one multi-line string instead of a
                # list of lines; split it so the heading scan below sees
                # lines rather than single characters.
                if isinstance(source, str):
                    source_lines = source.splitlines(keepends=True)
                else:
                    source_lines = source
                source_text = "".join(source_lines)

                if cell_type == "markdown":
                    md_output.append(source_text)

                    # Extract the first # heading as title if not already found
                    if title is None:
                        for line in source_lines:
                            if line.startswith("# "):
                                # Drop only the "# " marker; lstrip("# ")
                                # would also eat leading '#' characters that
                                # belong to the title text.
                                title = line[2:].strip()
                                break

                elif cell_type == "code":
                    # Code cells are wrapped in Markdown code blocks
                    md_output.append(f"```python\n{source_text}\n```")
                elif cell_type == "raw":
                    md_output.append(f"```\n{source_text}\n```")

            md_text = "\n\n".join(md_output)

            # Check for title in notebook metadata
            title = notebook_content.get("metadata", {}).get("title", title)

            return DocumentConverterResult(
                title=title,
                text_content=md_text,
            )

        except Exception as e:
            raise FileConversionException(
                f"Error converting .ipynb file: {str(e)}"
            ) from e
|
||||
|
||||
|
||||
class BingSerpConverter(DocumentConverter):
|
||||
"""
|
||||
Handle Bing results pages (only the organic search results).
|
||||
|
|
@ -715,7 +917,7 @@ class WavConverter(MediaConverter):
|
|||
"""
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a XLSX
|
||||
# Bail if not a WAV
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".wav":
|
||||
return None
|
||||
|
|
@ -827,11 +1029,11 @@ class Mp3Converter(WavConverter):
|
|||
|
||||
class ImageConverter(MediaConverter):
|
||||
"""
|
||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
|
||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||
"""
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a XLSX
|
||||
# Bail if not an image
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
||||
return None
|
||||
|
|
@ -857,17 +1059,17 @@ class ImageConverter(MediaConverter):
|
|||
md_content += f"{f}: {metadata[f]}\n"
|
||||
|
||||
# Try describing the image with GPTV
|
||||
mlm_client = kwargs.get("mlm_client")
|
||||
mlm_model = kwargs.get("mlm_model")
|
||||
if mlm_client is not None and mlm_model is not None:
|
||||
llm_client = kwargs.get("llm_client")
|
||||
llm_model = kwargs.get("llm_model")
|
||||
if llm_client is not None and llm_model is not None:
|
||||
md_content += (
|
||||
"\n# Description:\n"
|
||||
+ self._get_mlm_description(
|
||||
+ self._get_llm_description(
|
||||
local_path,
|
||||
extension,
|
||||
mlm_client,
|
||||
mlm_model,
|
||||
prompt=kwargs.get("mlm_prompt"),
|
||||
llm_client,
|
||||
llm_model,
|
||||
prompt=kwargs.get("llm_prompt"),
|
||||
).strip()
|
||||
+ "\n"
|
||||
)
|
||||
|
|
@ -877,12 +1079,10 @@ class ImageConverter(MediaConverter):
|
|||
text_content=md_content,
|
||||
)
|
||||
|
||||
def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
|
||||
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
|
||||
if prompt is None or prompt.strip() == "":
|
||||
prompt = "Write a detailed caption for this image."
|
||||
|
||||
sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
|
||||
|
||||
data_uri = ""
|
||||
with open(local_path, "rb") as image_file:
|
||||
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
||||
|
|
@ -968,27 +1168,33 @@ class ZipConverter(DocumentConverter):
|
|||
extracted_zip_folder_name = (
|
||||
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
|
||||
)
|
||||
new_folder = os.path.normpath(
|
||||
extraction_dir = os.path.normpath(
|
||||
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
|
||||
)
|
||||
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
|
||||
|
||||
# Safety check for path traversal
|
||||
if not new_folder.startswith(os.path.dirname(local_path)):
|
||||
return DocumentConverterResult(
|
||||
title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
|
||||
)
|
||||
|
||||
try:
|
||||
# Extract the zip file
|
||||
# Extract the zip file safely
|
||||
with zipfile.ZipFile(local_path, "r") as zipObj:
|
||||
zipObj.extractall(path=new_folder)
|
||||
# Safeguard against path traversal
|
||||
for member in zipObj.namelist():
|
||||
member_path = os.path.normpath(os.path.join(extraction_dir, member))
|
||||
if (
|
||||
not os.path.commonprefix([extraction_dir, member_path])
|
||||
== extraction_dir
|
||||
):
|
||||
raise ValueError(
|
||||
f"Path traversal detected in zip file: {member}"
|
||||
)
|
||||
|
||||
# Extract all files safely
|
||||
zipObj.extractall(path=extraction_dir)
|
||||
|
||||
# Process each extracted file
|
||||
for root, dirs, files in os.walk(new_folder):
|
||||
for root, dirs, files in os.walk(extraction_dir):
|
||||
for name in files:
|
||||
file_path = os.path.join(root, name)
|
||||
relative_path = os.path.relpath(file_path, new_folder)
|
||||
relative_path = os.path.relpath(file_path, extraction_dir)
|
||||
|
||||
# Get file extension
|
||||
_, file_extension = os.path.splitext(name)
|
||||
|
|
@ -1012,7 +1218,7 @@ class ZipConverter(DocumentConverter):
|
|||
|
||||
# Clean up extracted files if specified
|
||||
if kwargs.get("cleanup_extracted", True):
|
||||
shutil.rmtree(new_folder)
|
||||
shutil.rmtree(extraction_dir)
|
||||
|
||||
return DocumentConverterResult(title=None, text_content=md_content.strip())
|
||||
|
||||
|
|
@ -1021,6 +1227,11 @@ class ZipConverter(DocumentConverter):
|
|||
title=None,
|
||||
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
|
||||
)
|
||||
except ValueError as ve:
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
|
||||
)
|
||||
except Exception as e:
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
|
|
@ -1043,17 +1254,49 @@ class MarkItDown:
|
|||
def __init__(
|
||||
self,
|
||||
requests_session: Optional[requests.Session] = None,
|
||||
mlm_client: Optional[Any] = None,
|
||||
mlm_model: Optional[Any] = None,
|
||||
llm_client: Optional[Any] = None,
|
||||
llm_model: Optional[str] = None,
|
||||
style_map: Optional[str] = None,
|
||||
# Deprecated
|
||||
mlm_client: Optional[Any] = None,
|
||||
mlm_model: Optional[str] = None,
|
||||
):
|
||||
if requests_session is None:
|
||||
self._requests_session = requests.Session()
|
||||
else:
|
||||
self._requests_session = requests_session
|
||||
|
||||
self._mlm_client = mlm_client
|
||||
self._mlm_model = mlm_model
|
||||
# Handle deprecation notices
|
||||
#############################
|
||||
if mlm_client is not None:
|
||||
if llm_client is None:
|
||||
warn(
|
||||
"'mlm_client' is deprecated, and was renamed 'llm_client'.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
llm_client = mlm_client
|
||||
mlm_client = None
|
||||
else:
|
||||
raise ValueError(
|
||||
"'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
|
||||
)
|
||||
|
||||
if mlm_model is not None:
|
||||
if llm_model is None:
|
||||
warn(
|
||||
"'mlm_model' is deprecated, and was renamed 'llm_model'.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
llm_model = mlm_model
|
||||
mlm_model = None
|
||||
else:
|
||||
raise ValueError(
|
||||
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
|
||||
)
|
||||
#############################
|
||||
|
||||
self._llm_client = llm_client
|
||||
self._llm_model = llm_model
|
||||
self._style_map = style_map
|
||||
|
||||
self._page_converters: List[DocumentConverter] = []
|
||||
|
|
@ -1063,6 +1306,7 @@ class MarkItDown:
|
|||
# To this end, the most specific converters should appear below the most generic converters
|
||||
self.register_page_converter(PlainTextConverter())
|
||||
self.register_page_converter(HtmlConverter())
|
||||
self.register_page_converter(RSSConverter())
|
||||
self.register_page_converter(WikipediaConverter())
|
||||
self.register_page_converter(YouTubeConverter())
|
||||
self.register_page_converter(BingSerpConverter())
|
||||
|
|
@ -1072,15 +1316,16 @@ class MarkItDown:
|
|||
self.register_page_converter(WavConverter())
|
||||
self.register_page_converter(Mp3Converter())
|
||||
self.register_page_converter(ImageConverter())
|
||||
self.register_page_converter(IpynbConverter())
|
||||
self.register_page_converter(PdfConverter())
|
||||
self.register_page_converter(ZipConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response], **kwargs: Any
|
||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
"""
|
||||
Args:
|
||||
- source: can be a string representing a path or url, or a requests.response object
|
||||
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
|
||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||
"""
|
||||
|
||||
|
|
@ -1097,10 +1342,14 @@ class MarkItDown:
|
|||
# Request response
|
||||
elif isinstance(source, requests.Response):
|
||||
return self.convert_response(source, **kwargs)
|
||||
elif isinstance(source, Path):
|
||||
return self.convert_local(source, **kwargs)
|
||||
|
||||
def convert_local(
|
||||
self, path: str, **kwargs: Any
|
||||
self, path: Union[str, Path], **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
if isinstance(path, Path):
|
||||
path = str(path)
|
||||
# Prepare a list of extensions to try (in order of priority)
|
||||
ext = kwargs.get("file_extension")
|
||||
extensions = [ext] if ext is not None else []
|
||||
|
|
@ -1224,11 +1473,12 @@ class MarkItDown:
|
|||
_kwargs.update({"file_extension": ext})
|
||||
|
||||
# Copy any additional global options
|
||||
if "mlm_client" not in _kwargs and self._mlm_client is not None:
|
||||
_kwargs["mlm_client"] = self._mlm_client
|
||||
if "llm_client" not in _kwargs and self._llm_client is not None:
|
||||
_kwargs["llm_client"] = self._llm_client
|
||||
|
||||
if "llm_model" not in _kwargs and self._llm_model is not None:
|
||||
_kwargs["llm_model"] = self._llm_model
|
||||
|
||||
if "mlm_model" not in _kwargs and self._mlm_model is not None:
|
||||
_kwargs["mlm_model"] = self._mlm_model
|
||||
# Add the list of converters for nested processing
|
||||
_kwargs["_parent_converters"] = self._page_converters
|
||||
|
||||
|
|
@ -1270,8 +1520,7 @@ class MarkItDown:
|
|||
if ext == "":
|
||||
return
|
||||
# if ext not in extensions:
|
||||
if True:
|
||||
extensions.append(ext)
|
||||
extensions.append(ext)
|
||||
|
||||
def _guess_ext_magic(self, path):
|
||||
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
||||
|
|
|
|||
0
src/markitdown/py.typed
Normal file
0
src/markitdown/py.typed
Normal file
0
tests/test_files/test.docx
vendored
Executable file → Normal file
0
tests/test_files/test.docx
vendored
Executable file → Normal file
0
tests/test_files/test.jpg
vendored
Executable file → Normal file
0
tests/test_files/test.jpg
vendored
Executable file → Normal file
|
Before Width: | Height: | Size: 463 KiB After Width: | Height: | Size: 463 KiB |
0
tests/test_files/test.xlsx
vendored
Executable file → Normal file
0
tests/test_files/test.xlsx
vendored
Executable file → Normal file
BIN
tests/test_files/test_llm.jpg
vendored
Normal file
BIN
tests/test_files/test_llm.jpg
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 145 KiB |
89
tests/test_files/test_notebook.ipynb
vendored
Normal file
89
tests/test_files/test_notebook.ipynb
vendored
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0f61db80",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Test Notebook"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "3f2a5bbd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"markitdown\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('markitdown')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9b9c0468",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Code Cell Below"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "37d8088a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"42\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# comment in code\n",
|
||||
"print(42)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2e3177bd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"End\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.8"
|
||||
},
|
||||
"title": "Test Notebook Title"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
1
tests/test_files/test_rss.xml
vendored
Normal file
1
tests/test_files/test_rss.xml
vendored
Normal file
File diff suppressed because one or more lines are too long
0
tests/test_files/test_with_comment.docx
vendored
Executable file → Normal file
0
tests/test_files/test_with_comment.docx
vendored
Executable file → Normal file
|
|
@ -6,11 +6,23 @@ import shutil
|
|||
import pytest
|
||||
import requests
|
||||
|
||||
from warnings import catch_warnings, resetwarnings
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
skip_remote = (
|
||||
True if os.environ.get("GITHUB_ACTIONS") else False
|
||||
) # Don't run these tests in CI
|
||||
|
||||
|
||||
# Don't run the llm tests without a key and the client library
|
||||
skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
|
||||
try:
|
||||
import openai
|
||||
except ModuleNotFoundError:
|
||||
skip_llm = True
|
||||
|
||||
# Skip exiftool tests if not installed
|
||||
skip_exiftool = shutil.which("exiftool") is None
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
|
@ -79,6 +91,13 @@ BLOG_TEST_STRINGS = [
|
|||
"an example where high cost can easily prevent a generic complex",
|
||||
]
|
||||
|
||||
|
||||
RSS_TEST_STRINGS = [
|
||||
"The Official Microsoft Blog",
|
||||
"In the case of AI, it is absolutely true that the industry is moving incredibly fast",
|
||||
]
|
||||
|
||||
|
||||
WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
|
||||
WIKIPEDIA_TEST_STRINGS = [
|
||||
"Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
|
||||
|
|
@ -108,6 +127,21 @@ CSV_CP932_TEST_STRINGS = [
|
|||
"髙橋淳,35,名古屋",
|
||||
]
|
||||
|
||||
LLM_TEST_STRINGS = [
|
||||
"5bda1dd6",
|
||||
]
|
||||
|
||||
|
||||
# --- Helper Functions ---
|
||||
def validate_strings(result, expected_strings, exclude_strings=None):
|
||||
"""Validate presence or absence of specific strings."""
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
for string in expected_strings:
|
||||
assert string in text_content
|
||||
if exclude_strings:
|
||||
for string in exclude_strings:
|
||||
assert string not in text_content
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_remote,
|
||||
|
|
@ -141,11 +175,7 @@ def test_markitdown_local() -> None:
|
|||
|
||||
# Test XLSX processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
|
||||
# Check assertions
|
||||
for test_string in XLSX_TEST_STRINGS:
|
||||
assert test_string in text_content
|
||||
validate_strings(result, XLSX_TEST_STRINGS)
|
||||
|
||||
# Check negations
|
||||
assert "Unnamed:" not in text_content
|
||||
|
|
@ -153,73 +183,59 @@ def test_markitdown_local() -> None:
|
|||
|
||||
# Test DOCX processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
|
||||
for test_string in DOCX_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
validate_strings(result, DOCX_TEST_STRINGS)
|
||||
|
||||
# Test DOCX processing, with comments
|
||||
result = markitdown.convert(
|
||||
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
|
||||
style_map="comment-reference => ",
|
||||
)
|
||||
for test_string in DOCX_COMMENT_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||
|
||||
# Test DOCX processing, with comments and setting style_map on init
|
||||
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
|
||||
result = markitdown_with_style_map.convert(
|
||||
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
|
||||
)
|
||||
for test_string in DOCX_COMMENT_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||
|
||||
# Test PPTX processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
|
||||
for test_string in PPTX_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
validate_strings(result, PPTX_TEST_STRINGS)
|
||||
|
||||
# Test HTML processing
|
||||
result = markitdown.convert(
|
||||
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
|
||||
)
|
||||
for test_string in BLOG_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
validate_strings(result, BLOG_TEST_STRINGS)
|
||||
|
||||
# Test ZIP file processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
|
||||
for test_string in DOCX_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
validate_strings(result, XLSX_TEST_STRINGS)
|
||||
|
||||
# Test Wikipedia processing
|
||||
result = markitdown.convert(
|
||||
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
|
||||
)
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
for test_string in WIKIPEDIA_TEST_EXCLUDES:
|
||||
assert test_string not in text_content
|
||||
for test_string in WIKIPEDIA_TEST_STRINGS:
|
||||
assert test_string in text_content
|
||||
validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
|
||||
|
||||
# Test Bing processing
|
||||
result = markitdown.convert(
|
||||
os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
|
||||
)
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
for test_string in SERP_TEST_EXCLUDES:
|
||||
assert test_string not in text_content
|
||||
for test_string in SERP_TEST_STRINGS:
|
||||
validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
|
||||
|
||||
# Test RSS processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
for test_string in RSS_TEST_STRINGS:
|
||||
assert test_string in text_content
|
||||
|
||||
## Test non-UTF-8 encoding
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
for test_string in CSV_CP932_TEST_STRINGS:
|
||||
assert test_string in text_content
|
||||
validate_strings(result, CSV_CP932_TEST_STRINGS)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
|
|
@ -236,8 +252,63 @@ def test_markitdown_exiftool() -> None:
|
|||
assert target in result.text_content
|
||||
|
||||
|
||||
def test_markitdown_deprecation() -> None:
|
||||
try:
|
||||
with catch_warnings(record=True) as w:
|
||||
test_client = object()
|
||||
markitdown = MarkItDown(mlm_client=test_client)
|
||||
assert len(w) == 1
|
||||
assert w[0].category is DeprecationWarning
|
||||
assert markitdown._llm_client == test_client
|
||||
finally:
|
||||
resetwarnings()
|
||||
|
||||
try:
|
||||
with catch_warnings(record=True) as w:
|
||||
markitdown = MarkItDown(mlm_model="gpt-4o")
|
||||
assert len(w) == 1
|
||||
assert w[0].category is DeprecationWarning
|
||||
assert markitdown._llm_model == "gpt-4o"
|
||||
finally:
|
||||
resetwarnings()
|
||||
|
||||
try:
|
||||
test_client = object()
|
||||
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
|
||||
assert False
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
try:
|
||||
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
|
||||
assert False
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_llm,
|
||||
reason="do not run llm tests without a key",
|
||||
)
|
||||
def test_markitdown_llm() -> None:
|
||||
client = openai.OpenAI()
|
||||
markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
|
||||
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
|
||||
|
||||
for test_string in LLM_TEST_STRINGS:
|
||||
assert test_string in result.text_content
|
||||
|
||||
# This is not super precise. It would also accept "red square", "blue circle",
|
||||
# "the square is not blue", etc. But it's sufficient for this test.
|
||||
for test_string in ["red", "circle", "blue", "square"]:
|
||||
assert test_string in result.text_content.lower()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""Runs this file's tests from the command line."""
|
||||
test_markitdown_remote()
|
||||
test_markitdown_local()
|
||||
test_markitdown_exiftool()
|
||||
test_markitdown_deprecation()
|
||||
test_markitdown_llm()
|
||||
|
|
|
|||
Loading…
Reference in a new issue