Merge pull request #1 from pathintegral-institute/rong/tech-135-markitup-cleanup
Rong/tech 135 markitup cleanup
This commit is contained in:
commit
e729da2b38
109 changed files with 1367 additions and 4804 deletions
|
|
@ -1,32 +0,0 @@
|
|||
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
|
||||
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
|
||||
{
|
||||
"name": "Existing Dockerfile",
|
||||
"build": {
|
||||
// Sets the run context to one level up instead of the .devcontainer folder.
|
||||
"context": "..",
|
||||
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
|
||||
"dockerfile": "../Dockerfile",
|
||||
"args": {
|
||||
"INSTALL_GIT": "true"
|
||||
}
|
||||
},
|
||||
|
||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
||||
// "features": {},
|
||||
"features": {
|
||||
"ghcr.io/devcontainers-extra/features/hatch:2": {}
|
||||
},
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
// "forwardPorts": [],
|
||||
|
||||
// Uncomment the next line to run commands after the container is created.
|
||||
// "postCreateCommand": "cat /etc/os-release",
|
||||
|
||||
// Configure tool-specific properties.
|
||||
// "customizations": {},
|
||||
|
||||
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
|
||||
"remoteUser": "root"
|
||||
}
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
*
|
||||
!packages/
|
||||
2
.gitattributes
vendored
2
.gitattributes
vendored
|
|
@ -1,2 +0,0 @@
|
|||
packages/markitdown/tests/test_files/** linguist-vendored
|
||||
packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
|
||||
6
.github/dependabot.yml
vendored
6
.github/dependabot.yml
vendored
|
|
@ -1,6 +0,0 @@
|
|||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
20
.github/workflows/pre-commit.yml
vendored
20
.github/workflows/pre-commit.yml
vendored
|
|
@ -1,20 +0,0 @@
|
|||
name: pre-commit
|
||||
on: [pull_request]
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.x"
|
||||
|
||||
- name: Install pre-commit
|
||||
run: |
|
||||
pip install pre-commit
|
||||
pre-commit install --install-hooks
|
||||
|
||||
- name: Run pre-commit
|
||||
run: pre-commit run --all-files
|
||||
18
.github/workflows/tests.yml
vendored
18
.github/workflows/tests.yml
vendored
|
|
@ -1,18 +0,0 @@
|
|||
name: tests
|
||||
on: [pull_request]
|
||||
|
||||
jobs:
|
||||
tests:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: |
|
||||
3.10
|
||||
3.11
|
||||
3.12
|
||||
- name: Install Hatch
|
||||
run: pipx install hatch
|
||||
- name: Run tests
|
||||
run: cd packages/markitdown; hatch test
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -4,7 +4,7 @@
|
|||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
working/
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +0,0 @@
|
|||
repos:
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.7.0 # Use the latest version of Black
|
||||
hooks:
|
||||
- id: black
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
# Microsoft Open Source Code of Conduct
|
||||
|
||||
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
|
||||
|
||||
Resources:
|
||||
|
||||
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
|
||||
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
|
||||
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
|
||||
33
Dockerfile
33
Dockerfile
|
|
@ -1,33 +0,0 @@
|
|||
FROM python:3.13-slim-bullseye
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXIFTOOL_PATH=/usr/bin/exiftool
|
||||
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||
|
||||
# Runtime dependency
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
exiftool
|
||||
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get install -y --no-install-recommends \
|
||||
git; \
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
RUN pip --no-cache-dir install \
|
||||
/app/packages/markitdown[all] \
|
||||
/app/packages/markitdown-sample-plugin
|
||||
|
||||
# Default USERID and GROUPID
|
||||
ARG USERID=nobody
|
||||
ARG GROUPID=nogroup
|
||||
|
||||
USER $USERID:$GROUPID
|
||||
|
||||
ENTRYPOINT [ "markitdown" ]
|
||||
224
README.md
224
README.md
|
|
@ -1,223 +1,9 @@
|
|||
# MarkItDown
|
||||
# MarkItUp
|
||||
|
||||
[](https://pypi.org/project/markitdown/)
|
||||

|
||||
[](https://github.com/microsoft/autogen)
|
||||
This is a fork of [MarkItDown](https://github.com/microsoft/markitdown).
|
||||
|
||||
> [!TIP]
|
||||
> MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.
|
||||
While MarkItDown is a useful tool, its output is too text-focused and has not kept pace with the rise of multimodal LLMs.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Breaking changes from 0.0.1 to 0.1.0:
|
||||
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior.
|
||||
> * convert\_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, where it previously also accepted text file-like objects, like io.StringIO.
|
||||
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
|
||||
## Features
|
||||
|
||||
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
|
||||
|
||||
At present, MarkItDown supports:
|
||||
|
||||
- PDF
|
||||
- PowerPoint
|
||||
- Word
|
||||
- Excel
|
||||
- Images (EXIF metadata and OCR)
|
||||
- Audio (EXIF metadata and speech transcription)
|
||||
- HTML
|
||||
- Text-based formats (CSV, JSON, XML)
|
||||
- ZIP files (iterates over contents)
|
||||
- YouTube URLs
|
||||
- EPubs
|
||||
- ... and more!
|
||||
|
||||
## Why Markdown?
|
||||
|
||||
Markdown is extremely close to plain text, with minimal markup or formatting, but still
|
||||
provides a way to represent important document structure. Mainstream LLMs, such as
|
||||
OpenAI's GPT-4o, natively "_speak_" Markdown, and often incorporate Markdown into their
|
||||
responses unprompted. This suggests that they have been trained on vast amounts of
|
||||
Markdown-formatted text, and understand it well. As a side benefit, Markdown conventions
|
||||
are also highly token-efficient.
|
||||
|
||||
## Installation
|
||||
|
||||
To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:microsoft/markitdown.git
|
||||
cd markitdown
|
||||
pip install -e 'packages/markitdown[all]'
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Command-Line
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf > document.md
|
||||
```
|
||||
|
||||
Or use `-o` to specify the output file:
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf -o document.md
|
||||
```
|
||||
|
||||
You can also pipe content:
|
||||
|
||||
```bash
|
||||
cat path-to-file.pdf | markitdown
|
||||
```
|
||||
|
||||
### Optional Dependencies
|
||||
MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
|
||||
|
||||
```bash
|
||||
pip install 'markitdown[pdf, docx, pptx]'
|
||||
```
|
||||
|
||||
will install only the dependencies for PDF, DOCX, and PPTX files.
|
||||
|
||||
At the moment, the following optional dependencies are available:
|
||||
|
||||
* `[all]` Installs all optional dependencies
|
||||
* `[pptx]` Installs dependencies for PowerPoint files
|
||||
* `[docx]` Installs dependencies for Word files
|
||||
* `[xlsx]` Installs dependencies for Excel files
|
||||
* `[xls]` Installs dependencies for older Excel files
|
||||
* `[pdf]` Installs dependencies for PDF files
|
||||
* `[outlook]` Installs dependencies for Outlook messages
|
||||
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
|
||||
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
|
||||
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
|
||||
|
||||
### Plugins
|
||||
|
||||
MarkItDown also supports 3rd-party plugins. Plugins are disabled by default. To list installed plugins:
|
||||
|
||||
```bash
|
||||
markitdown --list-plugins
|
||||
```
|
||||
|
||||
To enable plugins use:
|
||||
|
||||
```bash
|
||||
markitdown --use-plugins path-to-file.pdf
|
||||
```
|
||||
|
||||
To find available plugins, search GitHub for the hashtag `#markitdown-plugin`. To develop a plugin, see `packages/markitdown-sample-plugin`.
|
||||
|
||||
### Azure Document Intelligence
|
||||
|
||||
To use Microsoft Document Intelligence for conversion:
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
|
||||
```
|
||||
|
||||
More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0)
|
||||
|
||||
### Python API
|
||||
|
||||
Basic usage in Python:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
|
||||
result = md.convert("test.xlsx")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
Document Intelligence conversion in Python:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
|
||||
result = md.convert("test.pdf")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI()
|
||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
|
||||
result = md.convert("example.jpg")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
### Docker
|
||||
|
||||
```sh
|
||||
docker build -t markitdown:latest .
|
||||
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
This project welcomes contributions and suggestions. Most contributions require you to agree to a
|
||||
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
|
||||
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
|
||||
|
||||
When you submit a pull request, a CLA bot will automatically determine whether you need to provide
|
||||
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
|
||||
provided by the bot. You will only need to do this once across all repos using our CLA.
|
||||
|
||||
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
|
||||
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
|
||||
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
||||
|
||||
### How to Contribute
|
||||
|
||||
You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are of course just suggestions and you are welcome to contribute in any way you like.
|
||||
|
||||
<div align="center">
|
||||
|
||||
| | All | Especially Needs Help from Community |
|
||||
| ---------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
|
||||
| **PRs** | [All PRs](https://github.com/microsoft/markitdown/pulls) | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22) |
|
||||
|
||||
</div>
|
||||
|
||||
### Running Tests and Checks
|
||||
|
||||
- Navigate to the MarkItDown package:
|
||||
|
||||
```sh
|
||||
cd packages/markitdown
|
||||
```
|
||||
|
||||
- Install `hatch` in your environment and run tests:
|
||||
|
||||
```sh
|
||||
pip install hatch # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
|
||||
hatch shell
|
||||
hatch test
|
||||
```
|
||||
|
||||
(Alternative) Use the Devcontainer which has all the dependencies installed:
|
||||
|
||||
```sh
|
||||
# Reopen the project in Devcontainer and run:
|
||||
hatch test
|
||||
```
|
||||
|
||||
- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
|
||||
|
||||
### Contributing 3rd-party Plugins
|
||||
|
||||
You can also contribute by creating and sharing 3rd party plugins. See `packages/markitdown-sample-plugin` for more details.
|
||||
|
||||
## Trademarks
|
||||
|
||||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
|
||||
trademarks or logos is subject to and must follow
|
||||
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
|
||||
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
|
||||
Any use of third-party trademarks or logos is subject to those third parties' policies.
|
||||
We plan to return an OpenAI-compatible response, which can be used by most LLM clients.
|
||||
41
SECURITY.md
41
SECURITY.md
|
|
@ -1,41 +0,0 @@
|
|||
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
|
||||
|
||||
## Security
|
||||
|
||||
Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
|
||||
|
||||
If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
|
||||
|
||||
## Reporting Security Issues
|
||||
|
||||
**Please do not report security vulnerabilities through public GitHub issues.**
|
||||
|
||||
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
|
||||
|
||||
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
|
||||
|
||||
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
|
||||
|
||||
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
|
||||
|
||||
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
|
||||
* Full paths of source file(s) related to the manifestation of the issue
|
||||
* The location of the affected source code (tag/branch/commit or direct URL)
|
||||
* Any special configuration required to reproduce the issue
|
||||
* Step-by-step instructions to reproduce the issue
|
||||
* Proof-of-concept or exploit code (if possible)
|
||||
* Impact of the issue, including how an attacker might exploit the issue
|
||||
|
||||
This information will help us triage your report more quickly.
|
||||
|
||||
If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
|
||||
|
||||
## Preferred Languages
|
||||
|
||||
We prefer all communications to be in English.
|
||||
|
||||
## Policy
|
||||
|
||||
Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
|
||||
|
||||
<!-- END MICROSOFT SECURITY.MD BLOCK -->
|
||||
25
SUPPORT.md
25
SUPPORT.md
|
|
@ -1,25 +0,0 @@
|
|||
# TODO: The maintainer of this repo has not yet edited this file
|
||||
|
||||
**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
|
||||
|
||||
- **No CSS support:** Fill out this template with information about how to file issues and get help.
|
||||
- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
|
||||
- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
|
||||
|
||||
*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
|
||||
|
||||
# Support
|
||||
|
||||
## How to file issues and get help
|
||||
|
||||
This project uses GitHub Issues to track bugs and feature requests. Please search the existing
|
||||
issues before filing new issues to avoid duplicates. For new issues, file your bug or
|
||||
feature request as a new Issue.
|
||||
|
||||
For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
|
||||
FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
|
||||
CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
|
||||
|
||||
## Microsoft Support Policy
|
||||
|
||||
Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
|
||||
|
|
@ -1,52 +0,0 @@
|
|||
# MarkItDown
|
||||
|
||||
> [!IMPORTANT]
|
||||
> MarkItDown is a Python package and command-line utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
||||
>
|
||||
> For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
|
||||
|
||||
## Installation
|
||||
|
||||
From PyPI:
|
||||
|
||||
```bash
|
||||
pip install markitdown[all]
|
||||
```
|
||||
|
||||
From source:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:microsoft/markitdown.git
|
||||
cd markitdown
|
||||
pip install -e packages/markitdown[all]
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Command-Line
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf > document.md
|
||||
```
|
||||
|
||||
### Python API
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown()
|
||||
result = md.convert("test.xlsx")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
### More Information
|
||||
|
||||
For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
|
||||
|
||||
## Trademarks
|
||||
|
||||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
|
||||
trademarks or logos is subject to and must follow
|
||||
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
|
||||
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
|
||||
Any use of third-party trademarks or logos is subject to those third parties' policies.
|
||||
|
|
@ -1,232 +0,0 @@
|
|||
# THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
|
||||
|
||||
**Do Not Translate or Localize**
|
||||
|
||||
This project incorporates components from the projects listed below. The original copyright notices and the licenses
|
||||
under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly
|
||||
granted herein, whether by implication, estoppel or otherwise.
|
||||
|
||||
1.dwml (https://github.com/xiilei/dwml)
|
||||
|
||||
dwml NOTICES AND INFORMATION BEGIN HERE
|
||||
|
||||
-----------------------------------------
|
||||
|
||||
NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including
|
||||
placeholders for the copyright owner and year.
|
||||
|
||||
NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented.
|
||||
The following section summarizes these changes. The full details are available in the MarkItDown source code
|
||||
repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160)
|
||||
|
||||
This project incorporates the `dwml/latex_dict.py` and `dwml/omml.py` files without any additional logic modifications (they
|
||||
live in `packages/markitdown/src/markitdown/converter_utils/docx/math`). However, we have reformatted the code
|
||||
according to `black` code formatter. From `tests/docx.py` file, we have used `DOCXML_ROOT` XML namespaces and the rest of
|
||||
the file is not used.
|
||||
|
||||
-----------------------------------------
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "{}"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright {yyyy} {name of copyright owner}
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
-----------------------------------------
|
||||
END OF dwml NOTICES AND INFORMATION
|
||||
|
|
@ -1,770 +0,0 @@
|
|||
import copy
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import tempfile
|
||||
import warnings
|
||||
import traceback
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from importlib.metadata import entry_points
|
||||
from typing import Any, List, Dict, Optional, Union, BinaryIO
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from warnings import warn
|
||||
import requests
|
||||
import magika
|
||||
import charset_normalizer
|
||||
import codecs
|
||||
|
||||
from ._stream_info import StreamInfo
|
||||
from ._uri_utils import parse_data_uri, file_uri_to_path
|
||||
|
||||
from .converters import (
|
||||
PlainTextConverter,
|
||||
HtmlConverter,
|
||||
RssConverter,
|
||||
WikipediaConverter,
|
||||
YouTubeConverter,
|
||||
IpynbConverter,
|
||||
BingSerpConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
XlsxConverter,
|
||||
XlsConverter,
|
||||
PptxConverter,
|
||||
ImageConverter,
|
||||
AudioConverter,
|
||||
OutlookMsgConverter,
|
||||
ZipConverter,
|
||||
EpubConverter,
|
||||
DocumentIntelligenceConverter,
|
||||
CsvConverter,
|
||||
)
|
||||
|
||||
from ._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
from ._exceptions import (
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
FailedConversionAttempt,
|
||||
)
|
||||
|
||||
|
||||
# Converter priorities. Lower priority values are tried first.
# These are the values passed as `priority=` to MarkItDown.register_converter.
PRIORITY_SPECIFIC_FILE_FORMAT = (
    0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
    10.0  # Near catch-all converters for mimetypes like text/*, etc.
)


# Module-level cache of loaded plugin objects, filled in by _load_plugins().
_plugins: Union[None, List[Any]] = None  # If None, plugins have not been loaded yet.
|
||||
|
||||
|
||||
def _load_plugins() -> Union[None, List[Any]]:
    """Return the list of loaded plugins, loading them on the first call.

    Plugins are discovered through the "markitdown.plugin" entry-point group.
    An entry point that fails to load is reported with a warning (including the
    traceback) and skipped; it does not abort loading of the others.
    """
    global _plugins

    if _plugins is None:
        # Publish the (initially empty) list before loading, so the cache is
        # marked as initialised from this point on.
        _plugins = []
        for entry_point in entry_points(group="markitdown.plugin"):
            try:
                plugin = entry_point.load()
            except Exception:
                warn(
                    f"Plugin '{entry_point.name}' failed to load ... skipping:\n{traceback.format_exc()}"
                )
            else:
                _plugins.append(plugin)

    return _plugins
|
||||
|
||||
|
||||
@dataclass(frozen=True, kw_only=True)
class ConverterRegistration:
    """Immutable record pairing a registered converter with its priority."""

    # The converter instance that was registered.
    converter: DocumentConverter
    # Sort key for the registration; lower values are tried first.
    priority: float
|
||||
|
||||
|
||||
class MarkItDown:
|
||||
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
||||
This reader will convert common file-types or webpages to Markdown."""
|
||||
|
||||
def __init__(
    self,
    *,
    enable_builtins: Union[None, bool] = None,
    enable_plugins: Union[None, bool] = None,
    **kwargs,
):
    """Create a converter front-end.

    Args:
        enable_builtins: register the built-in converters. ``None`` (the
            default) is treated as ``True``.
        enable_plugins: register plugin-provided converters. Off unless truthy.
        **kwargs: ``requests_session`` is read here; the full set of keyword
            arguments is also forwarded to ``enable_builtins`` and
            ``enable_plugins``.
    """
    self._builtins_enabled = False
    self._plugins_enabled = False

    # Use the caller's HTTP session when one is supplied.
    session = kwargs.get("requests_session")
    self._requests_session = requests.Session() if session is None else session

    self._magika = magika.Magika()

    # TODO - remove these (see enable_builtins)
    self._llm_client: Any = None
    self._llm_model: Optional[str] = None
    self._exiftool_path: Optional[str] = None
    self._style_map: Optional[str] = None

    # Registered converters; populated by register_converter().
    self._converters: List[ConverterRegistration] = []

    # Built-ins default to enabled when the flag is not specified.
    builtins_requested = True if enable_builtins is None else enable_builtins
    if builtins_requested:
        self.enable_builtins(**kwargs)

    if enable_plugins:
        self.enable_plugins(**kwargs)
|
||||
|
||||
def enable_builtins(self, **kwargs) -> None:
    """
    Register the built-in converters on this instance.

    Built-ins are enabled by default from the constructor, so this only needs
    to be called directly when they were initially disabled. A second call
    does nothing except emit a RuntimeWarning.

    Keyword arguments read here: llm_client, llm_model, exiftool_path,
    style_map, docintel_endpoint, docintel_credential, docintel_file_types.
    """
    if self._builtins_enabled:
        warn("Built-in converters are already enabled.", RuntimeWarning)
        return

    # TODO: Move these into converter constructors
    self._llm_client = kwargs.get("llm_client")
    self._llm_model = kwargs.get("llm_model")
    self._exiftool_path = kwargs.get("exiftool_path")
    self._style_map = kwargs.get("style_map")

    # Fall back to the environment when no explicit exiftool path was given.
    if self._exiftool_path is None:
        self._exiftool_path = os.getenv("EXIFTOOL_PATH")

    # Still nothing: accept an exiftool found on PATH, but only when it lives
    # directly inside one of these well-known directories.
    if self._exiftool_path is None:
        located = shutil.which("exiftool")
        if located:
            located = os.path.abspath(located)
            well_known_dirs = [
                "/usr/bin",
                "/usr/local/bin",
                "/opt",
                "/opt/bin",
                "/opt/local/bin",
                "/opt/homebrew/bin",
                "C:\\Windows\\System32",
                "C:\\Program Files",
                "C:\\Program Files (x86)",
            ]
            if os.path.dirname(located) in well_known_dirs:
                self._exiftool_path = located

    # Later registrations are tried before earlier ones of equal priority,
    # so generic converters are registered first and specific ones after.
    # Each converter is constructed immediately before it is registered.
    generic_factories = (
        PlainTextConverter,
        lambda: ZipConverter(markitdown=self),
        HtmlConverter,
    )
    for make_converter in generic_factories:
        self.register_converter(
            make_converter(), priority=PRIORITY_GENERIC_FILE_FORMAT
        )

    specific_converters = (
        RssConverter,
        WikipediaConverter,
        YouTubeConverter,
        BingSerpConverter,
        DocxConverter,
        XlsxConverter,
        XlsConverter,
        PptxConverter,
        AudioConverter,
        ImageConverter,
        IpynbConverter,
        PdfConverter,
        OutlookMsgConverter,
        EpubConverter,
        CsvConverter,
    )
    for converter_cls in specific_converters:
        self.register_converter(converter_cls())

    # Document Intelligence goes on top of the stack, only if an endpoint is given.
    docintel_endpoint = kwargs.get("docintel_endpoint")
    if docintel_endpoint is not None:
        docintel_args: Dict[str, Any] = {"endpoint": docintel_endpoint}
        optional_args = (
            ("docintel_credential", "credential"),
            ("docintel_file_types", "file_types"),
        )
        for kwarg_name, arg_name in optional_args:
            value = kwargs.get(kwarg_name)
            if value is not None:
                docintel_args[arg_name] = value

        self.register_converter(DocumentIntelligenceConverter(**docintel_args))

    self._builtins_enabled = True
|
||||
|
||||
def enable_plugins(self, **kwargs) -> None:
    """
    Register the converters contributed by installed plugins.

    Plugins are off by default, so call this once if they were not enabled
    from the constructor. A second call only emits a RuntimeWarning.
    A plugin whose register_converters() raises is reported with a warning
    and skipped; the remaining plugins are still processed.
    """
    if self._plugins_enabled:
        warn("Plugins converters are already enabled.", RuntimeWarning)
        return

    plugins = _load_plugins()
    assert plugins is not None
    for plugin in plugins:
        try:
            plugin.register_converters(self, **kwargs)
        except Exception:
            warn(
                f"Plugin '{plugin}' failed to register converters:\n{traceback.format_exc()}"
            )

    self._plugins_enabled = True
|
||||
|
||||
def convert(
    self,
    source: Union[str, requests.Response, Path, BinaryIO],
    *,
    stream_info: Optional[StreamInfo] = None,
    **kwargs: Any,
) -> DocumentConverterResult:  # TODO: deal with kwargs
    """
    Convert `source` to Markdown, dispatching on its type.

    Args:
        - source: a local path (str or Path), a http:/https:/file:/data: URI,
          a requests.Response object, or a binary file-like object
        - stream_info: optional stream info to use for the conversion. If None, infer from source
        - kwargs: additional arguments to pass to the converter

    Raises:
        TypeError: if `source` is none of the supported kinds.
    """
    if isinstance(source, str):
        if not source.startswith(("http:", "https:", "file:", "data:")):
            # Anything without a recognised scheme is treated as a local path.
            return self.convert_local(source, stream_info=stream_info, **kwargs)

        # Rename the url argument to mock_url
        # (Deprecated -- use stream_info)
        uri_kwargs = dict(kwargs)
        if "url" in uri_kwargs:
            uri_kwargs["mock_url"] = uri_kwargs.pop("url")
        return self.convert_uri(source, stream_info=stream_info, **uri_kwargs)

    if isinstance(source, Path):
        return self.convert_local(source, stream_info=stream_info, **kwargs)

    if isinstance(source, requests.Response):
        return self.convert_response(source, stream_info=stream_info, **kwargs)

    # Binary stream: has a callable read() and is not a text-mode stream.
    is_binary_stream = (
        hasattr(source, "read")
        and callable(source.read)
        and not isinstance(source, io.TextIOBase)
    )
    if is_binary_stream:
        return self.convert_stream(source, stream_info=stream_info, **kwargs)

    raise TypeError(
        f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
    )
|
||||
|
||||
def convert_local(
    self,
    path: Union[str, Path],
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    url: Optional[str] = None,  # Deprecated -- use stream_info
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert a file on the local filesystem.

    The initial stream-info guess is derived from the path (local path,
    extension, file name), then overlaid in order with `stream_info`,
    `file_extension` and `url` when given. The file is opened in binary mode
    for the duration of the conversion.
    """
    path_str = str(path) if isinstance(path, Path) else path

    # Start from what the path alone tells us.
    guess = StreamInfo(
        local_path=path_str,
        extension=os.path.splitext(path_str)[1],
        filename=os.path.basename(path_str),
    )

    # Overlay caller-supplied information; later updates win.
    if stream_info is not None:
        guess = guess.copy_and_update(stream_info)

    if file_extension is not None:
        # Deprecated -- use stream_info
        guess = guess.copy_and_update(extension=file_extension)

    if url is not None:
        # Deprecated -- use stream_info
        guess = guess.copy_and_update(url=url)

    with open(path_str, "rb") as fh:
        candidates = self._get_stream_info_guesses(file_stream=fh, base_guess=guess)
        return self._convert(file_stream=fh, stream_info_guesses=candidates, **kwargs)
|
||||
|
||||
def convert_stream(
    self,
    stream: BinaryIO,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    url: Optional[str] = None,  # Deprecated -- use stream_info
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert an already-open binary stream.

    A stream that is not seekable is first copied in full into an in-memory
    buffer. `stream_info`, `file_extension` and `url` are optional hints
    that seed the stream-info guess.
    """
    # Only build a base guess when the caller gave us something to go on.
    have_hints = not (stream_info is None and file_extension is None and url is None)
    base_guess = None
    if have_hints:
        base_guess = stream_info if stream_info is not None else StreamInfo()

        if file_extension is not None:
            # Deprecated -- use stream_info
            assert base_guess is not None  # for mypy
            base_guess = base_guess.copy_and_update(extension=file_extension)

        if url is not None:
            # Deprecated -- use stream_info
            assert base_guess is not None  # for mypy
            base_guess = base_guess.copy_and_update(url=url)

    # Non-seekable input is loaded entirely into memory.
    if not stream.seekable():
        in_memory = io.BytesIO()
        while chunk := stream.read(4096):
            in_memory.write(chunk)
        in_memory.seek(0)
        stream = in_memory

    # Add guesses based on stream content
    candidates = self._get_stream_info_guesses(
        file_stream=stream, base_guess=base_guess or StreamInfo()
    )
    return self._convert(file_stream=stream, stream_info_guesses=candidates, **kwargs)
|
||||
|
||||
def convert_url(
    self,
    url: str,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,
    mock_url: Optional[str] = None,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Alias for convert_uri(); all arguments are forwarded unchanged."""
    # convert_url will likely be deprecated in the future in favor of convert_uri
    forwarded = {
        "stream_info": stream_info,
        "file_extension": file_extension,
        "mock_url": mock_url,
    }
    return self.convert_uri(url, **forwarded, **kwargs)
|
||||
|
||||
def convert_uri(
    self,
    uri: str,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    mock_url: Optional[
        str
    ] = None,  # Mock the request as if it came from a different URL
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert the resource named by a file:, data:, http: or https: URI.

    Surrounding whitespace is stripped from `uri` before the scheme is
    examined.

    Raises:
        ValueError: for a file: URI whose netloc is neither empty nor
            "localhost", or for any other URI scheme.
    """
    uri = uri.strip()

    # File URIs -> local conversion
    if uri.startswith("file:"):
        netloc, path = file_uri_to_path(uri)
        if netloc and netloc != "localhost":
            raise ValueError(
                f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
            )
        return self.convert_local(
            path,
            stream_info=stream_info,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    # Data URIs -> decode the payload and convert it as an in-memory stream
    if uri.startswith("data:"):
        mimetype, attributes, data = parse_data_uri(uri)

        base_guess = StreamInfo(
            mimetype=mimetype,
            charset=attributes.get("charset"),
        )
        if stream_info is not None:
            base_guess = base_guess.copy_and_update(stream_info)

        return self.convert_stream(
            io.BytesIO(data),
            stream_info=base_guess,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    # HTTP/HTTPS URIs -> fetch, then convert the response
    if uri.startswith(("http:", "https:")):
        response = self._requests_session.get(uri, stream=True)
        response.raise_for_status()
        return self.convert_response(
            response,
            stream_info=stream_info,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    scheme = uri.partition(":")[0]
    raise ValueError(
        f"Unsupported URI scheme: {scheme}. Supported schemes are: file:, data:, http:, https:"
    )
|
||||
|
||||
def convert_response(
    self,
    response: requests.Response,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    url: Optional[str] = None,  # Deprecated -- use stream_info
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert the body of an HTTP response.

    The initial stream-info guess is assembled from the Content-Type header
    (mimetype, charset), the Content-Disposition header (filename, extension)
    and, when no filename was found there, the path of `response.url`.
    Explicit arguments are then applied on top. The body is read fully into
    memory before conversion.
    """
    headers = response.headers

    # Content-Type: "<mimetype>[; charset=<charset>][; ...]"
    mimetype: Optional[str] = None
    charset: Optional[str] = None
    if "content-type" in headers:
        mimetype_part, *params = headers["content-type"].split(";")
        mimetype = mimetype_part.strip()
        for param in params:
            if not param.strip().startswith("charset="):
                continue
            value = param.split("=")[1].strip()
            if value:
                charset = value

    # Content-Disposition: take the filename and, if it has one, its extension
    filename: Optional[str] = None
    extension: Optional[str] = None
    if "content-disposition" in headers:
        match = re.search(r"filename=([^;]+)", headers["content-disposition"])
        if match:
            filename = match.group(1).strip("\"'")
            suffix = os.path.splitext(filename)[1]
            if suffix:
                extension = suffix

    # Still no filename: use the URL path, but only if it ends in an extension
    if filename is None:
        url_path = urlparse(response.url).path
        suffix = os.path.splitext(url_path)[1]
        if suffix:  # Looks like this might be a file!
            filename = os.path.basename(url_path)
            extension = suffix

    base_guess = StreamInfo(
        mimetype=mimetype,
        charset=charset,
        filename=filename,
        extension=extension,
        url=response.url,
    )

    # Explicit arguments override what was inferred from the response
    if stream_info is not None:
        base_guess = base_guess.copy_and_update(stream_info)
    if file_extension is not None:
        # Deprecated -- use stream_info
        base_guess = base_guess.copy_and_update(extension=file_extension)
    if url is not None:
        # Deprecated -- use stream_info
        base_guess = base_guess.copy_and_update(url=url)

    # Read the whole body into a seekable in-memory buffer
    body = io.BytesIO()
    body.writelines(response.iter_content(chunk_size=512))
    body.seek(0)

    candidates = self._get_stream_info_guesses(file_stream=body, base_guess=base_guess)
    return self._convert(file_stream=body, stream_info_guesses=candidates, **kwargs)
|
||||
|
||||
def _convert(
    self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
) -> DocumentConverterResult:
    """Try each registered converter against each stream-info guess.

    Converters are tried in ascending priority order for every guess in
    `stream_info_guesses`, followed by one final empty StreamInfo. The first
    converter that accepts the stream and returns a non-None result wins; its
    text is normalised (trailing whitespace stripped from each line, runs of
    three or more newlines collapsed to two) before being returned.

    Raises:
        FileConversionException: at least one converter accepted the stream
            but raised while converting, and none succeeded.
        UnsupportedFormatException: no converter attempted a conversion.
    """
    # Keep track of which converters throw exceptions
    failures: List[FailedConversionAttempt] = []

    # Sort a copy on every call because priorities may change between calls.
    # sorted() is stable, so equal-priority converters keep their list order.
    ordered = sorted(self._converters, key=lambda x: x.priority)

    # Remember the initial stream position so that we can return to it
    start_pos = file_stream.tell()

    # Instance-wide options copied into the call when the caller did not set them
    global_options = ("llm_client", "llm_model", "style_map", "exiftool_path")

    for stream_info in stream_info_guesses + [StreamInfo()]:
        for registration in ordered:
            converter = registration.converter

            # Sanity check -- make sure the position is still the same
            assert (
                start_pos == file_stream.tell()
            ), "File stream position should NOT change between guess iterations"

            call_kwargs = dict(kwargs)
            for option in global_options:
                if option in call_kwargs:
                    continue
                fallback = getattr(self, "_" + option)
                if fallback is not None:
                    call_kwargs[option] = fallback

            # Add the list of converters for nested processing
            call_kwargs["_parent_converters"] = self._converters

            # Add legacy kwargs
            if stream_info is not None:
                if stream_info.extension is not None:
                    call_kwargs["file_extension"] = stream_info.extension

                if stream_info.url is not None:
                    call_kwargs["url"] = stream_info.url

            # A converter that does not implement accepts() is treated as declining
            try:
                accepted = converter.accepts(file_stream, stream_info, **call_kwargs)
            except NotImplementedError:
                accepted = False

            # accept() should not have changed the file stream position
            assert (
                start_pos == file_stream.tell()
            ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"

            if not accepted:
                continue

            # Attempt the conversion; always rewind afterwards
            result: Union[None, DocumentConverterResult] = None
            try:
                result = converter.convert(file_stream, stream_info, **call_kwargs)
            except Exception:
                failures.append(
                    FailedConversionAttempt(
                        converter=converter, exc_info=sys.exc_info()
                    )
                )
            finally:
                file_stream.seek(start_pos)

            if result is None:
                continue

            # Normalize the content
            stripped_lines = (
                line.rstrip() for line in re.split(r"\r?\n", result.text_content)
            )
            result.text_content = "\n".join(stripped_lines)
            result.text_content = re.sub(r"\n{3,}", "\n\n", result.text_content)
            return result

    # If we got this far without success, report any exceptions
    if failures:
        raise FileConversionException(attempts=failures)

    # Nothing can handle it!
    raise UnsupportedFormatException(
        "Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
    )
|
||||
|
||||
def register_page_converter(self, converter: DocumentConverter) -> None:
    """DEPRECATED: Use register_converter instead.

    Emits a DeprecationWarning and then registers `converter` through
    register_converter() with that method's default priority.
    """
    warn(
        "register_page_converter is deprecated. Use register_converter instead.",
        DeprecationWarning,
        # Attribute the warning to the caller rather than to this method;
        # Python's default filters only show DeprecationWarning when it is
        # triggered from __main__, so without this it would stay hidden.
        stacklevel=2,
    )
    self.register_converter(converter)
|
||||
|
||||
def register_converter(
    self,
    converter: DocumentConverter,
    *,
    priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
) -> None:
    """
    Register a DocumentConverter with a given priority.

    Priorities work as follows: lower values are tried first (i.e., higher
    priority). By default a converter is registered with
    PRIORITY_SPECIFIC_FILE_FORMAT (== 0.0). The exceptions among the
    built-ins are the PlainTextConverter, HtmlConverter, and ZipConverter,
    which are registered with PRIORITY_GENERIC_FILE_FORMAT (== 10.0).

    Just prior to conversion, the converters are sorted by priority, using
    a stable sort. New registrations are inserted at the front of the list,
    so among converters with the same priority the most recently registered
    one is tried first.

    We have tight control over the order of built-in converters, but
    plugins can register converters in any order. The registration's priority
    field reasserts some control over the order of converters.

    Plugins can register converters with any priority, to appear before or
    after the built-ins. For example, a plugin with priority 9 will run
    before the PlainTextConverter, but after the built-in converters
    registered at priority 0.

    Args:
        converter: the converter instance to register.
        priority: sort key for the converter; lower values are tried first.
    """
    registration = ConverterRegistration(converter=converter, priority=priority)
    # Insert at the front so newer registrations win ties under the stable sort.
    self._converters.insert(0, registration)
|
||||
|
||||
def _get_stream_info_guesses(
    self, file_stream: BinaryIO, base_guess: StreamInfo
) -> List[StreamInfo]:
    """
    Given a base guess, attempt to guess or expand on the stream info using the stream content (via magika).

    Returns a list holding one or two StreamInfo guesses:
      - magika identified the content and it agrees with `base_guess`:
        a single guess merging the two (fields from `base_guess` win);
      - magika identified the content but it disagrees with `base_guess`:
        the enhanced base guess first, then magika's own guess;
      - magika could not identify the content: the enhanced base guess only.

    The stream position is restored to where it was on entry, even on error.
    """
    guesses: List[StreamInfo] = []

    # Enhance the base guess with information based on the extension or mimetype
    enhanced_guess = base_guess.copy_and_update()

    # If there's an extension and no mimetype, try to guess the mimetype
    if base_guess.mimetype is None and base_guess.extension is not None:
        _m, _ = mimetypes.guess_type(
            "placeholder" + base_guess.extension, strict=False
        )
        if _m is not None:
            enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m)

    # If there's a mimetype and no extension, try to guess the extension
    if base_guess.mimetype is not None and base_guess.extension is None:
        _e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False)
        if len(_e) > 0:
            enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0])

    # Call magika to guess from the stream
    cur_pos = file_stream.tell()
    try:
        result = self._magika.identify_stream(file_stream)
        if result.status == "ok" and result.prediction.output.label != "unknown":
            # If it's text, also guess the charset
            charset = None
            if result.prediction.output.is_text:
                # Read the first 4k to guess the charset
                # (rewind first: identify_stream may have moved the position)
                file_stream.seek(cur_pos)
                stream_page = file_stream.read(4096)
                charset_result = charset_normalizer.from_bytes(stream_page).best()

                if charset_result is not None:
                    charset = self._normalize_charset(charset_result.encoding)

            # Normalize the first extension listed
            # (a leading "." is prepended to magika's extension)
            guessed_extension = None
            if len(result.prediction.output.extensions) > 0:
                guessed_extension = "." + result.prediction.output.extensions[0]

            # Determine if the guess is compatible with the base guess
            # Any field set on base_guess that disagrees with magika makes it incompatible.
            compatible = True
            if (
                base_guess.mimetype is not None
                and base_guess.mimetype != result.prediction.output.mime_type
            ):
                compatible = False

            if (
                base_guess.extension is not None
                and base_guess.extension.lstrip(".")
                not in result.prediction.output.extensions
            ):
                compatible = False

            # Note: `charset` stays None when magika does not classify the
            # content as text, so a base_guess that carries a charset is then
            # treated as incompatible.
            if (
                base_guess.charset is not None
                and self._normalize_charset(base_guess.charset) != charset
            ):
                compatible = False

            if compatible:
                # Add the compatible base guess
                # Fields come from base_guess (not enhanced_guess), with
                # magika's values filling in whatever base_guess left unset.
                guesses.append(
                    StreamInfo(
                        mimetype=base_guess.mimetype
                        or result.prediction.output.mime_type,
                        extension=base_guess.extension or guessed_extension,
                        charset=base_guess.charset or charset,
                        filename=base_guess.filename,
                        local_path=base_guess.local_path,
                        url=base_guess.url,
                    )
                )
            else:
                # The magika guess was incompatible with the base guess, so add both guesses
                guesses.append(enhanced_guess)
                guesses.append(
                    StreamInfo(
                        mimetype=result.prediction.output.mime_type,
                        extension=guessed_extension,
                        charset=charset,
                        filename=base_guess.filename,
                        local_path=base_guess.local_path,
                        url=base_guess.url,
                    )
                )
        else:
            # There were no other guesses, so just add the base guess
            guesses.append(enhanced_guess)
    finally:
        # Always leave the stream where we found it
        file_stream.seek(cur_pos)

    return guesses
|
||||
|
||||
def _normalize_charset(self, charset: str | None) -> str | None:
|
||||
"""
|
||||
Normalize a charset string to a canonical form.
|
||||
"""
|
||||
if charset is None:
|
||||
return None
|
||||
try:
|
||||
return codecs.lookup(charset).name
|
||||
except LookupError:
|
||||
return charset
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
from dataclasses import dataclass, asdict
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass(kw_only=True, frozen=True)
|
||||
class StreamInfo:
|
||||
"""The StreamInfo class is used to store information about a file stream.
|
||||
All fields can be None, and will depend on how the stream was opened.
|
||||
"""
|
||||
|
||||
mimetype: Optional[str] = None
|
||||
extension: Optional[str] = None
|
||||
charset: Optional[str] = None
|
||||
filename: Optional[
|
||||
str
|
||||
] = None # From local path, url, or Content-Disposition header
|
||||
local_path: Optional[str] = None # If read from disk
|
||||
url: Optional[str] = None # If read from url
|
||||
|
||||
def copy_and_update(self, *args, **kwargs):
|
||||
"""Copy the StreamInfo object and update it with the given StreamInfo
|
||||
instance and/or other keyword arguments."""
|
||||
new_info = asdict(self)
|
||||
|
||||
for si in args:
|
||||
assert isinstance(si, StreamInfo)
|
||||
new_info.update({k: v for k, v in asdict(si).items() if v is not None})
|
||||
|
||||
if len(kwargs) > 0:
|
||||
new_info.update(kwargs)
|
||||
|
||||
return StreamInfo(**new_info)
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
import io
|
||||
import re
|
||||
import base64
|
||||
import binascii
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from typing import Any, BinaryIO, Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
# Mimetype prefixes that BingSerpConverter.accepts() treats as HTML content
# (matched with str.startswith against the lower-cased mimetype).
ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
]

# File extensions that BingSerpConverter.accepts() treats as HTML content
# (matched exactly against the lower-cased extension, leading dot included).
ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
]
|
||||
|
||||
|
||||
class BingSerpConverter(DocumentConverter):
    """
    Converter for Bing search result pages (organic results only).
    NOTE: It is better to use the Bing API
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True only for HTML content whose URL is a Bing search URL.
        """
        page_url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Not a Bing SERP URL
        if not re.search(r"^https://www\.bing\.com/search\?q=", page_url):
            return False

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # HTML by mimetype; otherwise not HTML content
        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Render the organic results of a Bing results page as Markdown.

        The search query is taken from the `q` parameter of `stream_info.url`.
        Links carrying a `u` query parameter are rewritten to the URL decoded
        from it when decoding succeeds, and left untouched otherwise.
        """
        assert stream_info.url is not None

        # Parse the query parameters
        query = parse_qs(urlparse(stream_info.url).query).get("q", [""])[0]

        # Parse the stream, defaulting to UTF-8 when no charset is known
        encoding = stream_info.charset if stream_info.charset is not None else "utf-8"
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
            if hasattr(tptt, "string") and tptt.string:
                tptt.string += " "
        for slug in soup.find_all(class_="algoSlug_icon"):
            slug.extract()

        # Parse the algorithmic results
        _markdownify = _CustomMarkdownify(**kwargs)
        results = []
        for result in soup.find_all(class_="b_algo"):
            if not hasattr(result, "find_all"):
                continue

            # Rewrite redirect urls
            for anchor in result.find_all("a", href=True):
                params = parse_qs(urlparse(anchor["href"]).query)
                if "u" not in params:
                    continue

                # The destination is contained in the u parameter,
                # but appears to be base64 encoded, with some prefix.
                # Drop the first two characters and pad generously:
                # Python 3 doesn't care about extra padding.
                encoded = params["u"][0][2:].strip() + "=="

                try:
                    # RFC 4648 / Base64URL variant, which uses "-" and "_"
                    anchor["href"] = base64.b64decode(encoded, altchars="-_").decode(
                        "utf-8"
                    )
                except (UnicodeDecodeError, binascii.Error):
                    # Not decodable: keep the original href
                    pass

            # Convert to markdown, dropping blank lines and surrounding whitespace
            md_result = _markdownify.convert_soup(result).strip()
            stripped = (line.strip() for line in re.split(r"\n+", md_result))
            results.append("\n".join(line for line in stripped if line))

        heading = f"## A Bing search for '{query}' found the following results:\n\n"
        webpage_text = heading + "\n\n".join(results)

        title_tag = soup.title
        return DocumentConverterResult(
            markdown=webpage_text,
            title=title_tag.string if title_tag is not None else None,
        )
|
||||
|
|
@ -1,250 +0,0 @@
|
|||
import sys
|
||||
import re
|
||||
import os
|
||||
from typing import BinaryIO, Any, List, Optional, Union
|
||||
from enum import Enum
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# The Azure SDK is optional at import time (but required to actually use this
# converter). A failed import is remembered here and reported later.
_dependency_exc_info = None
try:
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import (
        AnalyzeDocumentRequest,
        AnalyzeResult,
        DocumentAnalysisFeature,
    )
    from azure.core.credentials import AzureKeyCredential, TokenCredential
    from azure.identity import DefaultAzureCredential
except ImportError:
    # Keep the exception and its traceback so they can be chained later.
    _dependency_exc_info = sys.exc_info()

    # Placeholder types: they only exist so that type hints in this module
    # still resolve when the Azure packages are not installed.
    class AzureKeyCredential:
        pass

    class TokenCredential:
        pass

    class DocumentIntelligenceClient:
        pass

    class AnalyzeDocumentRequest:
        pass

    class AnalyzeResult:
        pass

    class DocumentAnalysisFeature:
        pass

    class DefaultAzureCredential:
        pass


# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
|
||||
|
||||
|
||||
class DocumentIntelligenceFileType(str, Enum):
    """Enum of file types supported by the Document Intelligence Converter."""

    # No OCR
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"
    HTML = "html"
    # OCR
    PDF = "pdf"
    JPEG = "jpeg"
    PNG = "png"
    BMP = "bmp"
    TIFF = "tiff"


# (file type, MIME type prefixes) pairs. Kept as pairs compared with ``==``
# rather than a dict so that lookups behave exactly like the equality checks
# callers have always relied on.
_MIME_TYPE_PREFIXES_BY_TYPE = (
    (
        DocumentIntelligenceFileType.DOCX,
        ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",),
    ),
    (
        DocumentIntelligenceFileType.PPTX,
        ("application/vnd.openxmlformats-officedocument.presentationml",),
    ),
    (
        DocumentIntelligenceFileType.XLSX,
        ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",),
    ),
    # HTML previously had no entry, so it was silently ignored.
    (DocumentIntelligenceFileType.HTML, ("text/html", "application/xhtml+xml")),
    (DocumentIntelligenceFileType.PDF, ("application/pdf", "application/x-pdf")),
    (DocumentIntelligenceFileType.JPEG, ("image/jpeg",)),
    (DocumentIntelligenceFileType.PNG, ("image/png",)),
    (DocumentIntelligenceFileType.BMP, ("image/bmp",)),
    (DocumentIntelligenceFileType.TIFF, ("image/tiff",)),
)

# (file type, file extensions) pairs; extensions include the leading dot.
_FILE_EXTENSIONS_BY_TYPE = (
    (DocumentIntelligenceFileType.DOCX, (".docx",)),
    (DocumentIntelligenceFileType.PPTX, (".pptx",)),
    (DocumentIntelligenceFileType.XLSX, (".xlsx",)),
    # HTML previously had no entry, so it was silently ignored.
    (DocumentIntelligenceFileType.HTML, (".html",)),
    (DocumentIntelligenceFileType.PDF, (".pdf",)),
    (DocumentIntelligenceFileType.JPEG, (".jpg", ".jpeg")),
    (DocumentIntelligenceFileType.PNG, (".png",)),
    (DocumentIntelligenceFileType.BMP, (".bmp",)),
    (DocumentIntelligenceFileType.TIFF, (".tiff",)),
)


def _collect_for_types(types, table) -> List[str]:
    """Concatenate the table values of every entry in *types*, in input order."""
    collected: List[str] = []
    for type_ in types:
        for file_type, values in table:
            if type_ == file_type:
                collected.extend(values)
                break
    return collected


def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the MIME type prefixes for the given file types.

    Args:
        types: The file types to look up.

    Returns:
        The prefixes for each entry of ``types``, in input order. Entries
        that match no known file type contribute nothing.
    """
    return _collect_for_types(types, _MIME_TYPE_PREFIXES_BY_TYPE)


def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the file extensions for the given file types.

    Args:
        types: The file types to look up.

    Returns:
        The dot-prefixed extensions for each entry of ``types``, in input
        order. Entries that match no known file type contribute nothing.
    """
    return _collect_for_types(types, _FILE_EXTENSIONS_BY_TYPE)
|
||||
|
||||
|
||||
class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""

    def __init__(
        self,
        *,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
        credential: AzureKeyCredential | TokenCredential | None = None,
        file_types: List[DocumentIntelligenceFileType] = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.PDF,
            DocumentIntelligenceFileType.JPEG,
            DocumentIntelligenceFileType.PNG,
            DocumentIntelligenceFileType.BMP,
            DocumentIntelligenceFileType.TIFF,
        ],
    ):
        """
        Initialize the DocumentIntelligenceConverter.

        Args:
            endpoint (str): The endpoint for the Document Intelligence service.
            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
            credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
                When None, an AzureKeyCredential is built from the AZURE_API_KEY environment
                variable if it is set; otherwise DefaultAzureCredential() is used.
            file_types (List[DocumentIntelligenceFileType]): The file types to accept.
                Defaults to every DocumentIntelligenceFileType member except HTML.

        Raises:
            MissingDependencyException: If the optional Azure packages could not be imported.
        """

        super().__init__()
        # Store a copy: the default list is a single object shared by every
        # call, and a caller-supplied list may be mutated after construction.
        self._file_types = list(file_types)

        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
        # unless explicitly requested.
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        if credential is None:
            api_key = os.environ.get("AZURE_API_KEY")
            if api_key is None:
                credential = DefaultAzureCredential()
            else:
                credential = AzureKeyCredential(api_key)

        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=credential,
        )

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the stream's extension or MIME type matches a configured file type."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in _get_file_extensions(self._file_types):
            return True

        # str.startswith accepts a tuple; an empty tuple never matches.
        return mimetype.startswith(tuple(_get_mime_type_prefixes(self._file_types)))

    def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
        """
        Helper needed to determine which analysis features to use.
        Certain document analysis features are not available for
        office filetypes (.xlsx, .pptx, .html, .docx)

        Returns:
            An empty list for the no-OCR file types, otherwise the formula,
            high-resolution OCR and font-style features.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Types that don't support ocr
        no_ocr_types = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.HTML,
        ]

        if extension in _get_file_extensions(no_ocr_types):
            return []

        if mimetype.startswith(tuple(_get_mime_type_prefixes(no_ocr_types))):
            return []

        return [
            DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
            DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
            DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
        ]

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Send the whole stream to the "prebuilt-layout" model and return its Markdown output."""
        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
            features=self._analysis_features(stream_info),
            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        )
        result: AnalyzeResult = poller.result()

        # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(markdown=markdown_text)
|
||||
|
|
@ -1,147 +0,0 @@
|
|||
import os
|
||||
import zipfile
|
||||
import xml.dom.minidom as minidom
|
||||
|
||||
from typing import BinaryIO, Any, Dict, List
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# MIME type prefixes that identify an EPUB stream.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/epub",
    "application/epub+zip",
    "application/x-epub+zip",
]

# File extensions (lower-case, with leading dot) that identify an EPUB file.
ACCEPTED_FILE_EXTENSIONS = [".epub"]

# MIME type to report for each content document found inside the archive,
# keyed by its file extension.
MIME_TYPE_MAPPING = {
    ".html": "text/html",
    ".xhtml": "application/xhtml+xml",
}
|
||||
|
||||
|
||||
class EpubConverter(HtmlConverter):
    """
    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the extension or MIME type identifies an EPUB."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Convert an EPUB archive to Markdown.

        Reads the package metadata (title, authors, language, publisher,
        date, description, identifier), then converts every spine document
        that exists in the archive, in spine order, with the HTML converter.
        The formatted metadata is placed first in the output.

        Returns:
            A DocumentConverterResult whose title is the ``dc:title`` value
            (or None when absent).
        """
        with zipfile.ZipFile(file_stream, "r") as z:
            # Locate content.opf. The member handles are closed as soon as
            # they have been parsed.
            with z.open("META-INF/container.xml") as container_file:
                container_dom = minidom.parse(container_file)
            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
                "full-path"
            )

            # Parse content.opf
            with z.open(opf_path) as opf_file:
                opf_dom = minidom.parse(opf_file)
            metadata: Dict[str, Any] = {
                "title": self._get_text_from_node(opf_dom, "dc:title"),
                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
                "language": self._get_text_from_node(opf_dom, "dc:language"),
                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
                "date": self._get_text_from_node(opf_dom, "dc:date"),
                "description": self._get_text_from_node(opf_dom, "dc:description"),
                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
            }

            # Extract manifest items (ID → href mapping)
            manifest = {
                item.getAttribute("id"): item.getAttribute("href")
                for item in opf_dom.getElementsByTagName("item")
            }

            # Extract spine order (ID refs)
            spine_items = opf_dom.getElementsByTagName("itemref")
            spine_order = [item.getAttribute("idref") for item in spine_items]

            # Convert spine order to actual file paths
            base_path = "/".join(
                opf_path.split("/")[:-1]
            )  # Get base directory of content.opf
            spine = [
                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
                for item_id in spine_order
                if item_id in manifest
            ]

            # Build the member lookup once instead of once per spine entry.
            archive_members = set(z.namelist())

            # Extract and convert the content
            markdown_content: List[str] = []
            for file in spine:
                if file not in archive_members:
                    continue
                with z.open(file) as f:
                    filename = os.path.basename(file)
                    extension = os.path.splitext(filename)[1].lower()
                    mimetype = MIME_TYPE_MAPPING.get(extension)
                    converted_content = self._html_converter.convert(
                        f,
                        StreamInfo(
                            mimetype=mimetype,
                            extension=extension,
                            filename=filename,
                        ),
                    )
                    markdown_content.append(converted_content.markdown.strip())

            # Format and add the metadata
            metadata_markdown = []
            for key, value in metadata.items():
                if isinstance(value, list):
                    value = ", ".join(value)
                if value:
                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")

            markdown_content.insert(0, "\n".join(metadata_markdown))

            return DocumentConverterResult(
                markdown="\n\n".join(markdown_content), title=metadata["title"]
            )

    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
        """Convenience function to extract a single occurrence of a tag (e.g., title).

        Returns the first extracted text, or None when there is none.
        """
        texts = self._get_all_texts_from_nodes(dom, tag_name)
        return texts[0] if texts else None

    def _get_all_texts_from_nodes(
        self, dom: minidom.Document, tag_name: str
    ) -> List[str]:
        """Helper function to extract all occurrences of a tag (e.g., multiple authors).

        Only the first child of each matching element is considered. Elements
        that are empty, or whose first child carries no node value, are
        skipped.
        """
        texts: List[str] = []
        for node in dom.getElementsByTagName(tag_name):
            # nodeValue is None for element children; calling .strip() on it
            # used to raise AttributeError.
            value = getattr(node.firstChild, "nodeValue", None)
            if value is not None:
                texts.append(value.strip())
        return texts
|
||||
|
|
@ -1,138 +0,0 @@
|
|||
from typing import BinaryIO, Any, Union
|
||||
import base64
|
||||
import mimetypes
|
||||
from ._exiftool import exiftool_metadata
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# MIME type prefixes handled by the image converter.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "image/jpeg",
    "image/png",
]

# File extensions (lower-case, with leading dot) handled by the image converter.
ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]
|
||||
|
||||
|
||||
class ImageConverter(DocumentConverter):
    """
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the extension or MIME type is an accepted image type."""
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        mimetype = (stream_info.mimetype or "").lower()
        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Build Markdown from selected metadata fields and an optional LLM description.

        Recognized kwargs: ``exiftool_path``, ``llm_client``, ``llm_model``
        and ``llm_prompt``. The LLM is only consulted when both a client and
        a model are supplied.
        """
        parts = []

        # Add metadata
        metadata = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )
        if metadata:
            wanted_fields = (
                "ImageSize",
                "Title",
                "Caption",
                "Description",
                "Keywords",
                "Artist",
                "Author",
                "DateTimeOriginal",
                "CreateDate",
                "GPSPosition",
            )
            parts.extend(
                f"{field}: {metadata[field]}\n"
                for field in wanted_fields
                if field in metadata
            )

        # Try describing the image with GPT
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")
        if llm_client is not None and llm_model is not None:
            llm_description = self._get_llm_description(
                file_stream,
                stream_info,
                client=llm_client,
                model=llm_model,
                prompt=kwargs.get("llm_prompt"),
            )
            if llm_description is not None:
                parts.append("\n# Description:\n" + llm_description.strip() + "\n")

        return DocumentConverterResult(
            markdown="".join(parts),
        )

    def _get_llm_description(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        *,
        client,
        model,
        prompt=None,
    ) -> Union[None, str]:
        """Ask a chat-completions client to caption the image.

        The stream is read in full and then rewound to where it started.
        Returns None when reading/encoding the stream fails, otherwise the
        content of the first choice in the response.
        """
        if prompt is None or not prompt.strip():
            prompt = "Write a detailed caption for this image."

        # Get the content type: declared MIME type, else a guess from the
        # extension, else a generic binary type.
        content_type = stream_info.mimetype
        if not content_type:
            content_type = mimetypes.guess_type(
                "_dummy" + (stream_info.extension or "")
            )[0]
        if not content_type:
            content_type = "application/octet-stream"

        # Convert to base64
        start_pos = file_stream.tell()
        try:
            base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
        except Exception:
            return None
        finally:
            file_stream.seek(start_pos)

        # Prepare the data-uri
        data_uri = f"data:{content_type};base64,{base64_image}"

        # Prepare the OpenAI API request
        image_part = {"type": "image_url", "image_url": {"url": data_uri}}
        text_part = {"type": "text", "text": prompt}
        messages = [{"role": "user", "content": [text_part, image_part]}]

        # Call the OpenAI API
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
|
||||
|
|
@ -1,98 +0,0 @@
|
|||
from typing import BinaryIO, Any
|
||||
import json
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import FileConversionException
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# MIME type prefixes that *may* be a notebook; the content is inspected
# before such a stream is accepted.
CANDIDATE_MIME_TYPE_PREFIXES = [
    "application/json",
]

# File extensions (lower-case, with leading dot) accepted without inspection.
ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
|
||||
|
||||
|
||||
class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True for ``.ipynb`` files, or JSON streams that look like notebooks.

        For a candidate JSON MIME type the stream is read in full, checked
        for the ``nbformat`` and ``nbformat_minor`` markers, and rewound to
        its starting position.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                # Read further to see if it's a notebook
                cur_pos = file_stream.tell()
                try:
                    encoding = stream_info.charset or "utf-8"
                    notebook_content = file_stream.read().decode(encoding)
                    return (
                        "nbformat" in notebook_content
                        and "nbformat_minor" in notebook_content
                    )
                finally:
                    file_stream.seek(cur_pos)

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Decode the stream (declared charset, else UTF-8), parse it as JSON and convert it."""
        # Parse and convert the notebook
        encoding = stream_info.charset or "utf-8"
        notebook_content = file_stream.read().decode(encoding=encoding)
        return self._convert(json.loads(notebook_content))

    def _convert(self, notebook_content: dict) -> DocumentConverterResult:
        """Helper function that converts notebook JSON content to Markdown.

        Markdown cells are emitted verbatim, code cells are wrapped in a
        ``python`` fenced block and raw cells in a plain fenced block. The
        title is the notebook metadata ``title`` if present, otherwise the
        first ``# `` heading found in a markdown cell.

        Raises:
            FileConversionException: If anything goes wrong during conversion.
        """
        try:
            md_output = []
            title = None

            for cell in notebook_content.get("cells", []):
                cell_type = cell.get("cell_type", "")
                source = cell.get("source", [])
                # A cell's source may be one string instead of a list of
                # lines; split it so the heading scan below sees whole lines
                # rather than single characters.
                if isinstance(source, str):
                    source_lines = source.splitlines(keepends=True)
                else:
                    source_lines = source
                cell_text = "".join(source_lines)

                if cell_type == "markdown":
                    md_output.append(cell_text)

                    # Extract the first # heading as title if not already found
                    if title is None:
                        for line in source_lines:
                            if line.startswith("# "):
                                # Drop only the "# " marker. lstrip("# ")
                                # would also eat leading '#' characters that
                                # belong to the title text itself.
                                title = line[2:].strip()
                                break

                elif cell_type == "code":
                    # Code cells are wrapped in Markdown code blocks
                    md_output.append(f"```python\n{cell_text}\n```")
                elif cell_type == "raw":
                    md_output.append(f"```\n{cell_text}\n```")

            md_text = "\n\n".join(md_output)

            # Check for title in notebook metadata
            title = notebook_content.get("metadata", {}).get("title", title)

            return DocumentConverterResult(
                markdown=md_text,
                title=title,
            )

        except Exception as e:
            raise FileConversionException(
                f"Error converting .ipynb file: {str(e)}"
            ) from e
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
from typing import BinaryIO, Any, Union
|
||||
import base64
|
||||
import mimetypes
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
|
||||
def llm_caption(
    file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
) -> Union[None, str]:
    """Ask a chat-completions client to caption the image in *file_stream*.

    The stream is read in full and then rewound to where it started. A
    missing or blank prompt is replaced with a default one.

    Returns:
        None when reading/encoding the stream fails, otherwise the content
        of the first choice in the response.
    """
    if prompt is None or not prompt.strip():
        prompt = "Write a detailed caption for this image."

    # Get the content type: declared MIME type, else a guess from the
    # extension, else a generic binary type.
    content_type = stream_info.mimetype
    if not content_type:
        content_type = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))[0]
    if not content_type:
        content_type = "application/octet-stream"

    # Convert to base64
    start_pos = file_stream.tell()
    try:
        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
    except Exception:
        return None
    finally:
        file_stream.seek(start_pos)

    # Prepare the data-uri
    data_uri = f"data:{content_type};base64,{base64_image}"

    # Prepare the OpenAI API request
    text_part = {"type": "text", "text": prompt}
    image_part = {"type": "image_url", "image_url": {"url": data_uri}}
    messages = [{"role": "user", "content": [text_part, image_part]}]

    # Call the OpenAI API
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content
|
||||
|
|
@ -1,149 +0,0 @@
|
|||
import sys
|
||||
from typing import Any, Union, BinaryIO
|
||||
from .._stream_info import StreamInfo
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# olefile is optional at import time (but required to convert .msg files).
# A failed import is remembered here and reported later.
_dependency_exc_info = None
olefile = None
try:
    import olefile  # type: ignore[no-redef]
except ImportError:
    # Keep the exception and its traceback so they can be chained later.
    _dependency_exc_info = sys.exc_info()

# MIME type prefixes that identify an Outlook message.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.ms-outlook",
]

# File extensions (lower-case, with leading dot) that identify an Outlook message.
ACCEPTED_FILE_EXTENSIONS = [".msg"]
|
||||
|
||||
|
||||
class OutlookMsgConverter(DocumentConverter):
    """Converts Outlook .msg files to markdown by extracting email metadata and content.

    Uses the olefile package to parse the .msg file structure and extract:
    - Email headers (From, To, Subject)
    - Email body content
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the stream is (or looks like) an Outlook message.

        The extension and MIME type are checked first. Failing that, and only
        when olefile is importable, the stream is probed as an OLE container
        and its directory listing is searched for Outlook-specific entries.
        The stream position is restored after each probe.
        """
        extension = (stream_info.extension or "").lower()
        mimetype = (stream_info.mimetype or "").lower()

        # Check the extension and mimetype
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        if any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES):
            return True

        # Brute force, check if we have an OLE file
        cur_pos = file_stream.tell()
        try:
            if olefile and not olefile.isOleFile(file_stream):
                return False
        finally:
            file_stream.seek(cur_pos)

        # Brute force, check if it's an Outlook file
        # NOTE(review): the OleFileIO opened here is never closed. Whether
        # close() would also close file_stream depends on the olefile
        # version, so confirm before adding one.
        try:
            if olefile is not None:
                msg = olefile.OleFileIO(file_stream)
                toc = "\n".join(str(entry) for entry in msg.listdir())
                has_properties = "__properties_version1.0" in toc
                return has_properties and "__recip_version1.0_#00000000" in toc
        except Exception:
            pass
        finally:
            file_stream.seek(cur_pos)

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Render the message headers and body as Markdown.

        Raises:
            MissingDependencyException: If olefile could not be imported.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".msg",
                    feature="outlook",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        assert (
            olefile is not None
        )  # If we made it this far, olefile should be available
        msg = olefile.OleFileIO(file_stream)

        # Get headers
        headers = {
            "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
            "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
            "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
        }

        # Extract email metadata: title line, then one line per non-empty header
        sections = ["# Email Message\n\n"]
        sections.extend(
            f"**{key}:** {value}\n" for key, value in headers.items() if value
        )
        sections.append("\n## Content\n\n")

        # Get email body
        body = self._get_stream_data(msg, "__substg1.0_1000001F")
        if body:
            sections.append(body)

        msg.close()

        return DocumentConverterResult(
            markdown="".join(sections).strip(),
            title=headers.get("Subject"),
        )

    def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
        """Helper to safely extract and decode stream data from the MSG file.

        Returns the stripped text of ``stream_path``, or None when the stream
        does not exist or reading it fails.
        """
        assert olefile is not None
        assert isinstance(
            msg, olefile.OleFileIO
        )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)

        try:
            if msg.exists(stream_path):
                data = msg.openstream(stream_path).read()
                # Try UTF-16 first (common for .msg files), then fall back to UTF-8
                for encoding in ("utf-16-le", "utf-8"):
                    try:
                        return data.decode(encoding).strip()
                    except UnicodeDecodeError:
                        continue
                # Last resort - ignore errors
                return data.decode("utf-8", errors="ignore").strip()
        except Exception:
            pass
        return None
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
import sys
|
||||
import io
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
|
||||
# pdfminer is optional at import time (but required to convert PDFs).
# A failed import is remembered here and reported later.
_dependency_exc_info = None
try:
    import pdfminer
    import pdfminer.high_level
except ImportError:
    # Keep the exception and its traceback so they can be chained later.
    _dependency_exc_info = sys.exc_info()

# MIME type prefixes that identify a PDF.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/pdf",
    "application/x-pdf",
]

# File extensions (lower-case, with leading dot) that identify a PDF.
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
|
||||
|
||||
|
||||
class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the extension or MIME type identifies a PDF."""
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        mimetype = (stream_info.mimetype or "").lower()
        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Extract the PDF's text with pdfminer and return it as the Markdown body.

        Raises:
            MissingDependencyException: If pdfminer could not be imported.
        """
        # Check the dependencies
        if _dependency_exc_info is not None:
            message = MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".pdf",
                feature="pdf",
            )
            raise MissingDependencyException(message) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        assert isinstance(file_stream, io.IOBase)  # for mypy
        extracted_text = pdfminer.high_level.extract_text(file_stream)
        return DocumentConverterResult(
            markdown=extracted_text,
        )
|
||||
|
|
@ -1,71 +0,0 @@
|
|||
import sys
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
from charset_normalizer import from_bytes
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# Optional dependency probe; a failed import is remembered here.
# NOTE(review): nothing visible in this module uses mammoth or
# _dependency_exc_info. This looks copied from another converter; confirm
# before removing.
_dependency_exc_info = None
try:
    import mammoth
except ImportError:
    # Keep the exception and its traceback so they can be chained later.
    _dependency_exc_info = sys.exc_info()

# MIME type prefixes treated as plain text.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/",
    "application/json",
    "application/markdown",
]

# File extensions (lower-case, with leading dot) treated as plain text.
ACCEPTED_FILE_EXTENSIONS = [
    ".txt",
    ".text",
    ".md",
    ".markdown",
    ".json",
    ".jsonl",
]
|
||||
|
||||
|
||||
class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when a charset is known, or the extension/MIME type is text-like."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # If we have a charset, we can safely assume it's text
        # With Magika in the earlier stages, this handles most cases
        if stream_info.charset is not None:
            return True

        # Otherwise, check the mimetype and extension
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Decode the whole stream and return it unchanged as the Markdown body.

        The declared charset is used when present; otherwise the encoding is
        detected with charset_normalizer.
        """
        raw_bytes = file_stream.read()

        if stream_info.charset:
            text_content = raw_bytes.decode(stream_info.charset)
        else:
            best_match = from_bytes(raw_bytes).best()
            if best_match is not None:
                text_content = str(best_match)
            else:
                # No plausible encoding was detected. str(None) would put the
                # literal text "None" in the output, so decode leniently.
                text_content = raw_bytes.decode("utf-8", errors="replace")

        return DocumentConverterResult(markdown=text_content)
|
||||
|
|
@ -1,191 +0,0 @@
|
|||
from xml.dom import minidom
|
||||
from typing import BinaryIO, Any, Union
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
from .._stream_info import StreamInfo
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
# "Precise" types are accepted by RssConverter.accepts() without looking at
# the content.
PRECISE_MIME_TYPE_PREFIXES = [
    "application/rss",
    "application/rss+xml",
    "application/atom",
    "application/atom+xml",
]
PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]

# "Candidate" types are generic XML; RssConverter.accepts() parses the content
# before accepting them.
CANDIDATE_MIME_TYPE_PREFIXES = ["text/xml", "application/xml"]
CANDIDATE_FILE_EXTENSIONS = [".xml"]
|
||||
|
||||
|
||||
class RssConverter(DocumentConverter):
    """Convert RSS / Atom feeds to markdown."""

    def __init__(self):
        super().__init__()
        # Options of the most recent convert() call; forwarded to
        # _CustomMarkdownify when item bodies are rendered.
        self._kwargs = {}

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the stream is (or, for generic XML, parses as) a feed."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Precise mimetypes and file extensions are accepted outright
        if extension in PRECISE_FILE_EXTENSIONS:
            return True
        if mimetype.startswith(tuple(PRECISE_MIME_TYPE_PREFIXES)):
            return True

        # Candidate (generic XML) mimetypes and extensions are only accepted
        # when the content actually looks like a feed
        if extension in CANDIDATE_FILE_EXTENSIONS or mimetype.startswith(
            tuple(CANDIDATE_MIME_TYPE_PREFIXES)
        ):
            return self._check_xml(file_stream)

        return False

    def _check_xml(self, file_stream: BinaryIO) -> bool:
        """Parse the stream and report whether it is an RSS or Atom feed.

        The stream position is restored before returning.
        """
        cur_pos = file_stream.tell()
        try:
            doc = minidom.parse(file_stream)
            return self._feed_type(doc) is not None
        except Exception:
            # Unparseable content is simply "not a feed". Exception (rather
            # than BaseException) lets KeyboardInterrupt/SystemExit propagate.
            return False
        finally:
            file_stream.seek(cur_pos)

    def _feed_type(self, doc: Any) -> str | None:
        """Return "rss", "atom", or None when the document is neither."""
        if doc.getElementsByTagName("rss"):
            return "rss"
        elif doc.getElementsByTagName("feed"):
            root = doc.getElementsByTagName("feed")[0]
            if root.getElementsByTagName("entry"):
                # An Atom feed must have a root element of <feed> and at least one <entry>
                return "atom"
        return None

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Parse the feed and render it as markdown.

        Raises:
            ValueError: if the document is neither RSS nor Atom, or an RSS
                document has no <channel>.
        """
        self._kwargs = kwargs
        # NOTE(review): xml.dom.minidom is not hardened against maliciously
        # constructed XML (see the Python "XML vulnerabilities" docs); consider
        # a hardened parser if feeds come from untrusted sources.
        doc = minidom.parse(file_stream)
        feed_type = self._feed_type(doc)

        if feed_type == "rss":
            return self._parse_rss_type(doc)
        elif feed_type == "atom":
            return self._parse_atom_type(doc)
        else:
            raise ValueError("Unknown feed type")

    def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
        """Render an Atom feed (root <feed> with <entry> children) as markdown."""
        root = doc.getElementsByTagName("feed")[0]
        title = self._get_data_by_tag_name(root, "title")
        subtitle = self._get_data_by_tag_name(root, "subtitle")
        entries = root.getElementsByTagName("entry")

        parts: list[str] = []
        # Only emit the heading when a title exists (previously "# None").
        if title:
            parts.append(f"# {title}\n")
        if subtitle:
            parts.append(f"{subtitle}\n")

        for entry in entries:
            entry_title = self._get_data_by_tag_name(entry, "title")
            entry_summary = self._get_data_by_tag_name(entry, "summary")
            entry_updated = self._get_data_by_tag_name(entry, "updated")
            entry_content = self._get_data_by_tag_name(entry, "content")

            if entry_title:
                parts.append(f"\n## {entry_title}\n")
            if entry_updated:
                parts.append(f"Updated on: {entry_updated}\n")
            if entry_summary:
                parts.append(self._parse_content(entry_summary))
            if entry_content:
                parts.append(self._parse_content(entry_content))

        return DocumentConverterResult(
            markdown="".join(parts),
            title=title,
        )

    def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
        """Render an RSS feed (root <rss> with a <channel>) as markdown.

        Raises:
            ValueError: if the document has no <channel> element.
        """
        root = doc.getElementsByTagName("rss")[0]
        channel_list = root.getElementsByTagName("channel")
        if not channel_list:
            raise ValueError("No channel found in RSS feed")
        channel = channel_list[0]
        channel_title = self._get_data_by_tag_name(channel, "title")
        channel_description = self._get_data_by_tag_name(channel, "description")
        items = channel.getElementsByTagName("item")

        # Start from an empty buffer so a channel without a title no longer
        # hits an unbound local on the first append.
        parts: list[str] = []
        if channel_title:
            parts.append(f"# {channel_title}\n")
        if channel_description:
            parts.append(f"{channel_description}\n")

        for item in items:
            title = self._get_data_by_tag_name(item, "title")
            description = self._get_data_by_tag_name(item, "description")
            pubDate = self._get_data_by_tag_name(item, "pubDate")
            content = self._get_data_by_tag_name(item, "content:encoded")

            if title:
                parts.append(f"\n## {title}\n")
            if pubDate:
                parts.append(f"Published on: {pubDate}\n")
            if description:
                parts.append(self._parse_content(description))
            if content:
                parts.append(self._parse_content(content))

        return DocumentConverterResult(
            markdown="".join(parts),
            title=channel_title,
        )

    def _parse_content(self, content: str) -> str:
        """Render the (possibly HTML) body of a feed item as markdown.

        Falls back to returning the raw content when rendering fails.
        """
        try:
            # using bs4 because many RSS feeds have HTML-styled content
            soup = BeautifulSoup(content, "html.parser")
            return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
        except Exception:
            return content

    def _get_data_by_tag_name(
        self, element: minidom.Element, tag_name: str
    ) -> Union[str, None]:
        """Get data from first child element with the given tag name.

        Returns None when no such element is found, or when its first child
        carries no ``data`` attribute (i.e. is not a text-like node).
        """
        nodes = element.getElementsByTagName(tag_name)
        if not nodes:
            return None
        first_child = nodes[0].firstChild
        if first_child is not None and hasattr(first_child, "data"):
            return first_child.data
        return None
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
import io
|
||||
import sys
|
||||
from typing import BinaryIO
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
# Suppress some warnings on library import
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||
import speech_recognition as sr
|
||||
import pydub
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str:
    """Transcribe the speech in an audio stream to text.

    "wav", "aiff" and "flac" streams are handed to the recognizer directly;
    "mp3" and "mp4" streams are first re-encoded to WAV in memory with pydub.
    Recognition is done with ``Recognizer.recognize_google``.

    Returns:
        The stripped transcript, or "[No speech detected]" when it is empty.

    Raises:
        MissingDependencyException: if the optional audio dependencies failed
            to import when this module was loaded.
        ValueError: if ``audio_format`` is not one of the supported formats.
    """
    # Check for installed dependencies
    if _dependency_exc_info is not None:
        import_failure = _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
            _dependency_exc_info[2]
        )
        raise MissingDependencyException(
            "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
        ) from import_failure

    needs_reencoding = audio_format in ("mp3", "mp4")
    if not needs_reencoding and audio_format not in ("wav", "aiff", "flac"):
        raise ValueError(f"Unsupported audio format: {audio_format}")

    if needs_reencoding:
        # Re-encode to WAV in memory so the recognizer can read it.
        segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)
        wav_buffer = io.BytesIO()
        segment.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        audio_source = wav_buffer
    else:
        audio_source = file_stream

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_source) as source:
        recorded = recognizer.record(source)
        text = recognizer.recognize_google(recorded).strip()
        return text if text != "" else "[No speech detected]"
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
import io
|
||||
import re
|
||||
import bs4
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
# MIME type prefixes that WikipediaConverter.accepts() treats as HTML.
ACCEPTED_MIME_TYPE_PREFIXES = ["text/html", "application/xhtml"]

# Lower-case file extensions (leading dot included) treated as HTML.
ACCEPTED_FILE_EXTENSIONS = [".html", ".htm"]
|
||||
|
||||
|
||||
class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

    # Matches http(s)://<2-3 letter language code>.wikipedia.org/...
    # Both dots are escaped so that hosts such as "en.wikipediaXorg" no longer
    # pass as Wikipedia.
    _WIKIPEDIA_URL_RE = re.compile(r"^https?://[a-zA-Z]{2,3}\.wikipedia\.org/")

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Make sure we're dealing with HTML content *from* Wikipedia.
        """
        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if not self._WIKIPEDIA_URL_RE.search(url):
            # Not a Wikipedia URL
            return False

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # True for HTML mimetypes, False for anything else (not HTML content)
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Convert the article body (``div#mw-content-text``) to markdown.

        When that element is missing, the whole page is converted instead.
        The result title is the ``span.mw-page-title-main`` text when
        available, otherwise the page <title>.
        """
        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Locate the main content and the article title
        body_elm = soup.find("div", {"id": "mw-content-text"})
        title_elm = soup.find("span", {"class": "mw-page-title-main"})

        main_title = None if soup.title is None else soup.title.string

        if body_elm:
            # Prefer the article title, but keep the <title> fallback when the
            # span has no single string (Tag.string is None in that case).
            if isinstance(title_elm, bs4.Tag) and title_elm.string:
                main_title = title_elm.string

            # Convert the page
            body_markdown = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
            webpage_text = f"# {main_title}\n\n" + body_markdown
        else:
            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)

        return DocumentConverterResult(
            markdown=webpage_text,
            title=main_title,
        )
|
||||
|
|
@ -1,224 +0,0 @@
|
|||
import sys
|
||||
import json
|
||||
import time
|
||||
import io
|
||||
import re
|
||||
import bs4
|
||||
from typing import Any, BinaryIO, Optional, Dict, List, Union
|
||||
from urllib.parse import parse_qs, urlparse, unquote
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# Optional YouTube transcription support
|
||||
try:
|
||||
# Suppress some warnings on library import
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||
# Patch submitted upstream to fix the SyntaxWarning
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
|
||||
except ModuleNotFoundError:
|
||||
IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
|
||||
|
||||
|
||||
# MIME type prefixes that YouTubeConverter.accepts() treats as HTML.
ACCEPTED_MIME_TYPE_PREFIXES = ["text/html", "application/xhtml"]

# Lower-case file extensions (leading dot included) treated as HTML.
ACCEPTED_FILE_EXTENSIONS = [".html", ".htm"]
|
||||
|
||||
|
||||
class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Make sure we're dealing with HTML content *from* YouTube.
        """
        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Undo percent-encoding and backslash-escaped "?" / "=" before matching
        url = unquote(url)
        url = url.replace(r"\?", "?").replace(r"\=", "=")

        if not url.startswith("https://www.youtube.com/watch?"):
            # Not a YouTube URL
            return False

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # True for HTML mimetypes, False for anything else (not HTML content)
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Build a markdown summary of a YouTube watch page.

        The summary holds the title, selected metadata (views, keywords,
        runtime), the description and, when youtube_transcript_api is
        installed and the URL has a ``v`` parameter, the transcript.
        Failures while extracting the description or transcript are reported
        on stderr and do not abort the conversion.

        Recognized kwargs:
            youtube_transcript_languages: languages passed to the transcript
                API; defaults to ``("en",)``.
        """
        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Read the meta tags
        metadata: Dict[str, str] = {}

        if soup.title and soup.title.string:
            metadata["title"] = soup.title.string

        for meta in soup(["meta"]):
            if not isinstance(meta, bs4.Tag):
                continue

            for a in meta.attrs:
                if a in ("itemprop", "property", "name"):
                    key = str(meta.get(a, ""))
                    content = str(meta.get("content", ""))
                    if key and content:  # Only add non-empty content
                        metadata[key] = content
                    # Only the first identifying attribute of a tag is used
                    break

        # Try reading the description
        try:
            for script in soup(["script"]):
                if not isinstance(script, bs4.Tag):
                    continue
                if not script.string:  # Skip empty scripts
                    continue
                content = script.string
                if "ytInitialData" in content:
                    match = re.search(r"var ytInitialData = ({.*?});", content)
                    if match:
                        data = json.loads(match.group(1))
                        attrdesc = self._findKey(data, "attributedDescriptionBodyText")
                        if attrdesc and isinstance(attrdesc, dict):
                            metadata["description"] = str(attrdesc.get("content", ""))
                    break
        except Exception as e:
            # Best effort: report on stderr so the diagnostic never mixes with
            # output written to stdout.
            print(f"Error extracting description: {e}", file=sys.stderr)

        # Start preparing the page
        webpage_text = "# YouTube\n"

        # _get() returns None when no title metadata exists; normalise to ""
        # instead of asserting, so such pages no longer raise AssertionError.
        title = self._get(metadata, ["title", "og:title", "name"]) or ""

        if title:
            webpage_text += f"\n## {title}\n"

        stats = ""
        views = self._get(metadata, ["interactionCount"])
        if views:
            stats += f"- **Views:** {views}\n"

        keywords = self._get(metadata, ["keywords"])
        if keywords:
            stats += f"- **Keywords:** {keywords}\n"

        runtime = self._get(metadata, ["duration"])
        if runtime:
            stats += f"- **Runtime:** {runtime}\n"

        if stats:
            webpage_text += f"\n### Video Metadata\n{stats}\n"

        description = self._get(metadata, ["description", "og:description"])
        if description:
            webpage_text += f"\n### Description\n{description}\n"

        if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
            ytt_api = YouTubeTranscriptApi()
            transcript_text = ""
            parsed_url = urlparse(stream_info.url)  # type: ignore
            params = parse_qs(parsed_url.query)  # type: ignore
            if "v" in params and params["v"][0]:
                video_id = str(params["v"][0])
                try:
                    youtube_transcript_languages = kwargs.get(
                        "youtube_transcript_languages", ("en",)
                    )
                    # Retry the transcript fetching operation
                    transcript = self._retry_operation(
                        lambda: ytt_api.fetch(
                            video_id, languages=youtube_transcript_languages
                        ),
                        retries=3,  # Retry 3 times
                        delay=2,  # 2 seconds delay between retries
                    )
                    if transcript:
                        transcript_text = " ".join(
                            [part.text for part in transcript]
                        )  # type: ignore
                except Exception as e:
                    print(f"Error fetching transcript: {e}", file=sys.stderr)
            if transcript_text:
                webpage_text += f"\n### Transcript\n{transcript_text}\n"

        # Last-resort title: the page <title>, or "" when there is none
        if not title and soup.title is not None and soup.title.string:
            title = str(soup.title.string)

        return DocumentConverterResult(
            markdown=webpage_text,
            title=title,
        )

    def _get(
        self,
        metadata: Dict[str, str],
        keys: List[str],
        default: Union[str, None] = None,
    ) -> Union[str, None]:
        """Get first non-empty value from metadata matching given keys.

        Keys are tried in order; ``default`` is returned when none of them
        maps to a non-empty value.
        """
        for k in keys:
            value = metadata.get(k)
            if value:
                return value
        return default

    def _findKey(self, json: Any, key: str) -> Any:
        """Recursively search for a key in nested dictionary/list structures.

        Returns the value stored under the first occurrence of ``key`` in a
        depth-first walk, or None when the key is absent. The parameter name
        shadows the ``json`` module inside this method only; it is kept for
        backward compatibility.
        """
        if isinstance(json, list):
            for elm in json:
                ret = self._findKey(elm, key)
                if ret is not None:
                    return ret
        elif isinstance(json, dict):
            for k, v in json.items():
                if k == key:
                    return v
                # Compare against None (as the list branch does) so that falsy
                # matches such as "" or {} are not silently skipped.
                ret = self._findKey(v, key)
                if ret is not None:
                    return ret
        return None

    def _retry_operation(self, operation, retries=3, delay=2):
        """Call ``operation`` until it succeeds, at most ``retries`` times.

        Sleeps ``delay`` seconds between attempts and reports each failure on
        stderr.

        Raises:
            Exception: when every attempt failed; the last failure is attached
                as ``__cause__``.
        """
        last_error = None
        for attempt in range(retries):
            try:
                return operation()  # Attempt the operation
            except Exception as e:
                last_error = e
                print(f"Attempt {attempt + 1} failed: {e}", file=sys.stderr)
                if attempt < retries - 1:
                    time.sleep(delay)  # Wait before retrying
        # All attempts failed: chain the last error so it is not lost
        raise Exception(f"Operation failed after {retries} attempts.") from last_error
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
import sys
|
||||
import zipfile
|
||||
import io
|
||||
import os
|
||||
|
||||
from typing import BinaryIO, Any, TYPE_CHECKING
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import UnsupportedFormatException, FileConversionException
|
||||
|
||||
# Break otherwise circular import for type hinting
|
||||
if TYPE_CHECKING:
|
||||
from .._markitdown import MarkItDown
|
||||
|
||||
# MIME type prefixes that ZipConverter.accepts() matches with startswith().
ACCEPTED_MIME_TYPE_PREFIXES = ["application/zip"]

# Lower-case file extensions (leading dot included) treated as ZIP archives.
ACCEPTED_FILE_EXTENSIONS = [
    ".zip",
]
|
||||
|
||||
|
||||
class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by converting all contained files.

    Each archive member is read into memory and passed to the owning
    MarkItDown instance, which picks a converter from the member's file name
    and extension. The results are combined into a single markdown document.
    Nothing is written to disk.

    Example output format:
    ```markdown
    Content from the zip file `example.zip`:

    ## File: docs/readme.txt

    This is the content of readme.txt
    Multiple lines are preserved

    ## File: images/example.jpg

    ImageSize: 1920x1080
    DateTimeOriginal: 2024-02-15 14:30:00
    Description: A beautiful landscape photo

    ## File: data/report.xlsx

    ## Sheet1
    | Column1 | Column2 | Column3 |
    |---------|---------|---------|
    | data1   | data2   | data3   |
    | data4   | data5   | data6   |
    ```

    Key features:
    - Maintains original file structure in headings
    - Uses the MarkItDown instance's converters for each file type
    - Preserves formatting of converted content
    - Silently skips members that cannot be converted
    """

    def __init__(
        self,
        *,
        markitdown: "MarkItDown",
    ):
        """Remember the MarkItDown instance used to convert archive members."""
        super().__init__()
        self._markitdown = markitdown

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True for a ``.zip`` extension or an ``application/zip`` mimetype."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Convert every member of the archive and concatenate the results.

        Members that raise UnsupportedFormatException or
        FileConversionException are left out of the output. Directory entries
        are skipped, since they have no content to convert.
        """
        file_path = stream_info.url or stream_info.local_path or stream_info.filename
        sections = [f"Content from the zip file `{file_path}`:\n\n"]

        with zipfile.ZipFile(file_stream, "r") as zip_obj:
            # infolist() yields members in the same order as namelist()
            for member in zip_obj.infolist():
                if member.is_dir():
                    continue

                name = member.filename
                try:
                    z_file_stream = io.BytesIO(zip_obj.read(name))
                    z_file_stream_info = StreamInfo(
                        extension=os.path.splitext(name)[1],
                        filename=os.path.basename(name),
                    )
                    result = self._markitdown.convert_stream(
                        stream=z_file_stream,
                        stream_info=z_file_stream_info,
                    )
                except (UnsupportedFormatException, FileConversionException):
                    # Deliberate best effort: unconvertible members are omitted
                    continue

                if result is not None:
                    sections.append(f"## File: {name}\n\n")
                    sections.append(result.markdown + "\n\n")

        return DocumentConverterResult(markdown="".join(sections).strip())
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
|
@ -1,279 +0,0 @@
|
|||
import dataclasses
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True, kw_only=True)
|
||||
class FileTestVector(object):
|
||||
filename: str
|
||||
mimetype: str | None
|
||||
charset: str | None
|
||||
url: str | None
|
||||
must_include: List[str]
|
||||
must_not_include: List[str]
|
||||
|
||||
|
||||
# Expectations for conversions with default options. The docx/pptx vectors
# exclude the full base64 image payloads that DATA_URI_TEST_VECTORS requires.
GENERAL_TEST_VECTORS = [
    FileTestVector(
        filename="test.docx",
        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        charset=None,
        url=None,
        must_include=[
            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
            "49e168b7-d2ae-407f-a055-2167576f39a1",
            "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
            "# Abstract",
            "# Introduction",
            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
            "data:image/png;base64...",
        ],
        must_not_include=[
            "data:image/png;base64,iVBORw0KGgoAAAANSU",
        ],
    ),
    FileTestVector(
        filename="test.xlsx",
        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        charset=None,
        url=None,
        must_include=[
            "## 09060124-b5e7-4717-9d07-3c046eb",
            "6ff4173b-42a5-4784-9b19-f49caff4d93d",
            "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test.xls",
        mimetype="application/vnd.ms-excel",
        charset=None,
        url=None,
        must_include=[
            "## 09060124-b5e7-4717-9d07-3c046eb",
            "6ff4173b-42a5-4784-9b19-f49caff4d93d",
            "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test.pptx",
        mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
        charset=None,
        url=None,
        must_include=[
            "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
            "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
            "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
            "1b92870d-e3b5-4e65-8153-919f4ff45592",
            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
            "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
            "2003",  # chart value
            # Image referenced by file name rather than a data URI. An empty
            # string here would match any output and check nothing.
            "![This phrase of the caption is Human-written.](Picture4.jpg)",
        ],
        must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
    ),
    FileTestVector(
        filename="test_outlook_msg.msg",
        mimetype="application/vnd.ms-outlook",
        charset=None,
        url=None,
        must_include=[
            "# Email Message",
            "**From:** test.sender@example.com",
            "**To:** test.recipient@example.com",
            "**Subject:** Test Email Message",
            "## Content",
            "This is the body of the test email message",
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test.pdf",
        mimetype="application/pdf",
        charset=None,
        url=None,
        must_include=[
            "While there is contemporaneous exploration of multi-agent approaches"
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test_blog.html",
        mimetype="text/html",
        charset="utf-8",
        url="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math",
        must_include=[
            "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
            "an example where high cost can easily prevent a generic complex",
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test_wikipedia.html",
        mimetype="text/html",
        charset="utf-8",
        url="https://en.wikipedia.org/wiki/Microsoft",
        must_include=[
            "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
            'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
        ],
        must_not_include=[
            "You are encouraged to create an account and log in",
            "154 languages",
            "move to sidebar",
        ],
    ),
    FileTestVector(
        filename="test_serp.html",
        mimetype="text/html",
        charset="utf-8",
        url="https://www.bing.com/search?q=microsoft+wikipedia",
        must_include=[
            "](https://en.wikipedia.org/wiki/Microsoft",
            "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond",
            "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox",
        ],
        must_not_include=[
            "https://www.bing.com/ck/a?!&&p=",
            "data:image/svg+xml,%3Csvg%20width%3D",
        ],
    ),
    FileTestVector(
        filename="test_mskanji.csv",
        mimetype="text/csv",
        charset="cp932",
        url=None,
        must_include=[
            "| 名前 | 年齢 | 住所 |",
            "| --- | --- | --- |",
            "| 佐藤太郎 | 30 | 東京 |",
            "| 三木英子 | 25 | 大阪 |",
            "| 髙橋淳 | 35 | 名古屋 |",
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test.json",
        mimetype="application/json",
        charset="ascii",
        url=None,
        must_include=[
            "5b64c88c-b3c3-4510-bcb8-da0b200602d8",
            "9700dc99-6685-40b4-9a3a-5e406dcb37f3",
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test_rss.xml",
        mimetype="text/xml",
        charset="utf-8",
        url=None,
        must_include=[
            "# The Official Microsoft Blog",
            "## Ignite 2024: Why nearly 70% of the Fortune 500 now use Microsoft 365 Copilot",
            "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
        ],
        must_not_include=["<rss", "<feed"],
    ),
    FileTestVector(
        filename="test_notebook.ipynb",
        mimetype="application/json",
        charset="ascii",
        url=None,
        must_include=[
            "# Test Notebook",
            "```python",
            'print("markitdown")',
            "```",
            "## Code Cell Below",
        ],
        must_not_include=[
            "nbformat",
            "nbformat_minor",
        ],
    ),
    FileTestVector(
        filename="test_files.zip",
        mimetype="application/zip",
        charset=None,
        url=None,
        must_include=[
            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
            "49e168b7-d2ae-407f-a055-2167576f39a1",
            "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
            "# Abstract",
            "# Introduction",
            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
            "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
            "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
            "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
            "1b92870d-e3b5-4e65-8153-919f4ff45592",
            "## 09060124-b5e7-4717-9d07-3c046eb",
            "6ff4173b-42a5-4784-9b19-f49caff4d93d",
            "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
            "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
            'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test.epub",
        mimetype="application/epub+zip",
        charset=None,
        url=None,
        must_include=[
            "**Authors:** Test Author",
            "A test EPUB document for MarkItDown testing",
            "# Chapter 1: Test Content",
            "This is a **test** paragraph with some formatting",
            "* A bullet point",
            "* Another point",
            "# Chapter 2: More Content",
            "*different* style",
            "> This is a blockquote for testing",
        ],
        must_not_include=[],
    ),
]
|
||||
|
||||
|
||||
# Expectations for conversions that keep embedded images as full data URIs:
# the base64 payload must appear, and the shortened/file-name forms must not.
DATA_URI_TEST_VECTORS = [
    FileTestVector(
        filename="test.docx",
        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        charset=None,
        url=None,
        must_include=[
            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
            "49e168b7-d2ae-407f-a055-2167576f39a1",
            "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
            "# Abstract",
            "# Introduction",
            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
            "data:image/png;base64,iVBORw0KGgoAAAANSU",
        ],
        must_not_include=[
            "data:image/png;base64...",
        ],
    ),
    FileTestVector(
        filename="test.pptx",
        mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
        charset=None,
        url=None,
        must_include=[
            "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
            "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
            "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
            "1b92870d-e3b5-4e65-8153-919f4ff45592",
            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
            "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
            "2003",  # chart value
            "![This phrase of the caption is Human-written.]",  # image caption
            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
        ],
        must_not_include=[
            # The file-name form of the image link must be replaced by the
            # data URI. An empty string here could never pass, because "" is
            # a substring of every output.
            "![This phrase of the caption is Human-written.](Picture4.jpg)",
        ],
    ),
]
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
#!/usr/bin/env python3 -m pytest
|
||||
import subprocess
|
||||
import pytest
|
||||
from markitdown import __version__
|
||||
|
||||
# This file contains CLI tests that are not directly tested by the FileTestVectors.
|
||||
# This includes things like help messages, version numbers, and invalid flags.
|
||||
|
||||
|
||||
def test_version() -> None:
    """Check that ``--version`` exits cleanly and prints the package version.

    The CLI is launched through ``sys.executable`` so it runs under the same
    interpreter as the test suite instead of whichever ``python`` is first on
    ``PATH`` (which may be missing or belong to another environment).
    """
    import sys  # local import: sys is not imported at module level here

    result = subprocess.run(
        [sys.executable, "-m", "markitdown", "--version"],
        capture_output=True,
        text=True,
    )

    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
    assert __version__ in result.stdout, f"Version not found in output: {result.stdout}"
|
||||
|
||||
|
||||
def test_invalid_flag() -> None:
    """Check that an unknown flag makes the CLI fail with a usage message.

    The process must exit with a non-zero status and STDERR must contain both
    "unrecognized arguments" and "SYNTAX".
    """
    import sys  # local import: sys is not imported at module level here

    # sys.executable guarantees the CLI runs under the interpreter running the tests.
    result = subprocess.run(
        [sys.executable, "-m", "markitdown", "--foobar"],
        capture_output=True,
        text=True,
    )

    # This assertion fires when the CLI *succeeds*, so say so in the message.
    assert (
        result.returncode != 0
    ), f"CLI unexpectedly succeeded with an invalid flag: {result.stdout}"
    assert (
        "unrecognized arguments" in result.stderr
    ), "Expected 'unrecognized arguments' to appear in STDERR"
    assert "SYNTAX" in result.stderr, "Expected 'SYNTAX' to appear in STDERR"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Runs this file's tests from the command line.
    for _cli_test in (test_version, test_invalid_flag):
        _cli_test()
    print("All tests passed!")
|
||||
|
|
@ -1,227 +0,0 @@
|
|||
#!/usr/bin/env python3 -m pytest
|
||||
import os
|
||||
import time
|
||||
import pytest
|
||||
import subprocess
|
||||
import locale
|
||||
from typing import List
|
||||
|
||||
if __name__ == "__main__":
|
||||
from _test_vectors import (
|
||||
GENERAL_TEST_VECTORS,
|
||||
DATA_URI_TEST_VECTORS,
|
||||
FileTestVector,
|
||||
)
|
||||
else:
|
||||
from ._test_vectors import (
|
||||
GENERAL_TEST_VECTORS,
|
||||
DATA_URI_TEST_VECTORS,
|
||||
FileTestVector,
|
||||
)
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
UnsupportedFormatException,
|
||||
FileConversionException,
|
||||
StreamInfo,
|
||||
)
|
||||
|
||||
# Don't run remote tests in CI (GITHUB_ACTIONS is set to a non-empty value there).
skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"

# Prepare CLI test vectors (remove vectors that require mocking the url)
CLI_TEST_VECTORS: List[FileTestVector] = [
    vector for vector in GENERAL_TEST_VECTORS if vector.url is None
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def shared_tmp_dir(tmp_path_factory):
    """Create one temporary directory, shared for the whole test session."""
    session_dir = tmp_path_factory.mktemp("pytest_tmp")
    return session_dir
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_output_to_stdout(shared_tmp_dir, test_vector) -> None:
    """Test that the CLI writes the converted document to stdout.

    ``shared_tmp_dir`` is unused here; it is accepted so that all CLI vector
    tests share one signature.
    """
    import sys  # local import: sys is not imported at module level here

    input_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    # sys.executable guarantees the CLI runs under the interpreter running the tests.
    result = subprocess.run(
        [sys.executable, "-m", "markitdown", input_path],
        capture_output=True,
        text=True,
    )

    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
    for test_string in test_vector.must_include:
        assert test_string in result.stdout
    for test_string in test_vector.must_not_include:
        assert test_string not in result.stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_output_to_file(shared_tmp_dir, test_vector) -> None:
    """Test that ``-o`` makes the CLI write the converted document to a file.

    The output file is created inside ``shared_tmp_dir``, checked against the
    vector's expected substrings, and removed again.
    """
    import sys  # local import: sys is not imported at module level here

    input_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
    # sys.executable guarantees the CLI runs under the interpreter running the tests.
    result = subprocess.run(
        [sys.executable, "-m", "markitdown", "-o", output_file, input_path],
        capture_output=True,
        text=True,
    )

    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
    assert os.path.exists(output_file), f"Output file not created: {output_file}"

    with open(output_file, "r") as f:
        output_data = f.read()
        for test_string in test_vector.must_include:
            assert test_string in output_data
        for test_string in test_vector.must_not_include:
            assert test_string not in output_data

    os.remove(output_file)
    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
    """Test that the CLI reads its input from stdin correctly.

    The raw bytes of the test file are piped to the process and no filename
    (or other hint) is given on the command line. ``shared_tmp_dir`` is unused
    and only keeps the signature uniform.
    """
    import sys  # local import: sys is not imported at module level here

    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
        test_input = stream.read()

    # No positional filename: previously the path was also passed as an
    # argument, so the piped data was never what the CLI converted.
    # NOTE(review): assumes the CLI falls back to stdin when no filename is
    # given -- confirm against the markitdown CLI entry point.
    result = subprocess.run(
        [sys.executable, "-m", "markitdown"],
        input=test_input,
        capture_output=True,
        text=False,
    )

    stdout = result.stdout.decode(locale.getpreferredencoding())
    assert (
        result.returncode == 0
    ), f"CLI exited with error: {result.stderr.decode('utf-8')}"
    for test_string in test_vector.must_include:
        assert test_string in stdout
    for test_string in test_vector.must_not_include:
        assert test_string not in stdout
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    skip_remote,
    reason="do not run tests that query external urls",
)
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_convert_url(shared_tmp_dir, test_vector):
    """Test that the CLI converts a document addressed by an HTTP(S) URL.

    ``shared_tmp_dir`` is not used here, but is needed to match the signature
    of the other CLI vector tests.
    """
    import sys  # local import: sys is not imported at module level here

    time.sleep(1)  # Ensure we don't hit rate limits
    # sys.executable guarantees the CLI runs under the interpreter running the tests.
    result = subprocess.run(
        [sys.executable, "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
        capture_output=True,
        text=False,
    )

    stdout = result.stdout.decode(locale.getpreferredencoding())
    # stderr is bytes (text=False); decode it so the failure message is readable.
    assert (
        result.returncode == 0
    ), f"CLI exited with error: {result.stderr.decode('utf-8', errors='replace')}"
    for test_string in test_vector.must_include:
        assert test_string in stdout
    for test_string in test_vector.must_not_include:
        assert test_string not in stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
    """Test CLI functionality when keep_data_uris is enabled.

    Runs the CLI with ``--keep-data-uris -o <file>``, checks the written file
    against the vector's expected substrings, and removes the file again.
    """
    import sys  # local import: sys is not imported at module level here

    input_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
    # sys.executable guarantees the CLI runs under the interpreter running the tests.
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "markitdown",
            "--keep-data-uris",
            "-o",
            output_file,
            input_path,
        ],
        capture_output=True,
        text=True,
    )

    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
    assert os.path.exists(output_file), f"Output file not created: {output_file}"

    with open(output_file, "r") as f:
        output_data = f.read()
        for test_string in test_vector.must_include:
            assert test_string in output_data
        for test_string in test_vector.must_not_include:
            assert test_string not in output_data

    os.remove(output_file)
    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import tempfile

    # Runs this file's tests from the command line.
    # pytest's skipif markers are not evaluated when the test functions are
    # called directly, so the remote test is left out by hand when
    # skip_remote is set.
    general_tests = [
        test_output_to_stdout,
        test_output_to_file,
        test_input_from_stdin_without_hints,
    ]
    if not skip_remote:
        general_tests.append(test_convert_url)

    with tempfile.TemporaryDirectory() as tmp_dir:
        for test_functions, test_vectors in (
            (general_tests, CLI_TEST_VECTORS),  # General tests
            ([test_output_to_file_with_data_uris], DATA_URI_TEST_VECTORS),  # Data URI tests
        ):
            for test_function in test_functions:
                for test_vector in test_vectors:
                    print(
                        f"Running {test_function.__name__} on {test_vector.filename}...",
                        end="",
                    )
                    test_function(tmp_dir, test_vector)
                    print("OK")

    print("All tests passed!")
|
||||
Binary file not shown.
|
|
@ -1,419 +0,0 @@
|
|||
#!/usr/bin/env python3 -m pytest
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
from markitdown._uri_utils import parse_data_uri, file_uri_to_path
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
UnsupportedFormatException,
|
||||
FileConversionException,
|
||||
StreamInfo,
|
||||
)
|
||||
|
||||
# This file contains module tests that are not directly tested by the FileTestVectors.
# This includes things like helper functions and runtime conversion options
# (e.g., LLM clients, exiftool path, transcription services, etc.)

# Don't run these tests in CI
skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))

# Don't run the llm tests without a key and the client library
skip_llm = not os.environ.get("OPENAI_API_KEY")
# NOTE(review): openai is also imported unconditionally at the top of this
# file, so this guard only takes effect if that import is made optional.
try:
    import openai
except ModuleNotFoundError:
    skip_llm = True

# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")

# Expected "key: value" metadata for test.jpg
JPG_TEST_EXIFTOOL = {
    "Author": "AutoGen Authors",
    "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "Description": "AutoGen enables diverse LLM-based applications",
    "ImageSize": "1615x1967",
    "DateTimeOriginal": "2024:03:14 22:10:00",
}

# Expected "key: value" metadata for test.mp3
MP3_TEST_EXIFTOOL = {
    "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
    "Artist": "Artist Name Test String",
    "Album": "Album Name Test String",
    "SampleRate": "48000",
}

PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
PDF_TEST_STRINGS = [
    "While there is contemporaneous exploration of multi-agent approaches"
]

YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg"
YOUTUBE_TEST_STRINGS = [
    "## AutoGen FULL Tutorial with Python (Step-By-Step)",
    "This is an intermediate tutorial for installing and using AutoGen locally",
    "PT15M4S",
    "the model we're going to be using today is GPT 3.5 turbo",  # From the transcript
]

DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
]

BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_TEST_STRINGS = [
    "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
    "an example where high cost can easily prevent a generic complex",
]

LLM_TEST_STRINGS = [
    "5bda1dd6",
]

PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
    "2003",  # chart value
]
|
||||
|
||||
|
||||
# --- Helper Functions ---
|
||||
def validate_strings(result, expected_strings, exclude_strings=None):
    """Assert which substrings appear in a conversion result.

    Backslashes are stripped from ``result.text_content`` before matching.
    Every entry of ``expected_strings`` must be present; every entry of
    ``exclude_strings`` (when given and non-empty) must be absent.

    Raises:
        AssertionError: on the first expectation that does not hold.
    """
    haystack = result.text_content.replace("\\", "")
    for wanted in expected_strings:
        assert wanted in haystack
    for banned in exclude_strings or ():
        assert banned not in haystack
|
||||
|
||||
|
||||
def test_stream_info_operations() -> None:
    """Exercise ``StreamInfo.copy_and_update`` with keywords, objects and mixes."""
    field_names = ("mimetype", "extension", "charset", "filename", "local_path", "url")
    original = StreamInfo(**{name: f"{name}.1" for name in field_names})

    def check_single_change(updated, changed) -> None:
        # The targeted attribute is updated ...
        assert getattr(updated, changed) == f"{changed}.2"
        # ... and every other attribute still matches the original.
        for other in field_names:
            if other != changed:
                assert getattr(original, other) == getattr(updated, other)

    # Updating each attribute by keyword
    for name in field_names:
        check_single_change(original.copy_and_update(**{name: f"{name}.2"}), name)

    # Updating each attribute by passing a new StreamInfo object
    for name in field_names:
        check_single_change(
            original.copy_and_update(StreamInfo(**{name: f"{name}.2"})), name
        )

    # Mixing a StreamInfo object with keyword arguments
    mixed = original.copy_and_update(
        StreamInfo(extension="extension.2", filename="filename.2"),
        mimetype="mimetype.3",
        charset="charset.3",
    )
    expected_mixed = {
        "extension": "extension.2",
        "filename": "filename.2",
        "mimetype": "mimetype.3",
        "charset": "charset.3",
        "local_path": "local_path.1",
        "url": "url.1",
    }
    for attribute, value in expected_mixed.items():
        assert getattr(mixed, attribute) == value

    # Passing multiple StreamInfo objects
    combined = original.copy_and_update(
        StreamInfo(extension="extension.4", filename="filename.5"),
        StreamInfo(mimetype="mimetype.6", charset="charset.7"),
    )
    expected_combined = {
        "extension": "extension.4",
        "filename": "filename.5",
        "mimetype": "mimetype.6",
        "charset": "charset.7",
        "local_path": "local_path.1",
        "url": "url.1",
    }
    for attribute, value in expected_combined.items():
        assert getattr(combined, attribute) == value
|
||||
|
||||
|
||||
def test_data_uris() -> None:
    """Test basic parsing of data URIs (base64 and percent-encoded forms)."""
    # (data URI, expected mime type, expected attributes)
    cases = [
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", "text/plain", {}),
        ("data:base64,SGVsbG8sIFdvcmxkIQ==", None, {}),
        (
            "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==",
            "text/plain",
            {"charset": "utf-8"},
        ),
        ("data:,Hello%2C%20World%21", None, {}),
        ("data:text/plain,Hello%2C%20World%21", "text/plain", {}),
        (
            "data:text/plain;charset=utf-8,Hello%2C%20World%21",
            "text/plain",
            {"charset": "utf-8"},
        ),
    ]

    for data_uri, expected_mime, expected_attributes in cases:
        mime_type, attributes, data = parse_data_uri(data_uri)
        if expected_mime is None:
            assert mime_type is None
        else:
            assert mime_type == expected_mime
        assert len(attributes) == len(expected_attributes)
        for key, value in expected_attributes.items():
            assert attributes[key] == value
        # Every case encodes the same payload.
        assert data == b"Hello, World!"
|
||||
|
||||
|
||||
def test_file_uris() -> None:
    """Test splitting file URIs into a network location and a path."""
    # (file URI, expected netloc) -- the expected path is the same for all.
    cases = [
        ("file:///path/to/file.txt", None),  # empty host
        ("file:/path/to/file.txt", None),  # no host
        ("file://localhost/path/to/file.txt", "localhost"),  # localhost
        ("file:///path/to/file.txt?param=value", None),  # query parameters
        ("file:///path/to/file.txt#fragment", None),  # fragment
    ]

    for file_uri, expected_netloc in cases:
        netloc, path = file_uri_to_path(file_uri)
        if expected_netloc is None:
            assert netloc is None
        else:
            assert netloc == expected_netloc
        assert path == "/path/to/file.txt"
|
||||
|
||||
|
||||
def test_docx_comments() -> None:
    """Test DOCX processing, with comments and setting style_map on init."""
    # The converter is configured once, at construction time; the previously
    # created default MarkItDown() instance was never used and is dropped.
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    result = markitdown_with_style_map.convert(
        os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
    )
    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||
|
||||
|
||||
def test_docx_equations() -> None:
    """Check that DOCX equations are emitted in ``$...$`` / ``$$...$$`` form."""
    converter = MarkItDown()
    converted_text = converter.convert(
        os.path.join(TEST_FILES_DIR, "equations.docx")
    ).text_content

    # The inline equation m=1 must appear wrapped in single dollar signs.
    assert "$m=1$" in converted_text, "Inline equation $m=1$ not found"

    # At least one block equation wrapped in double dollar signs must exist.
    block_equations = re.findall(r"\$\$(.+?)\$\$", converted_text)
    assert block_equations, "No block equations found in the document."
|
||||
|
||||
|
||||
def test_input_as_strings() -> None:
    """Convert in-memory HTML byte streams, with and without leading blanks."""
    converter = MarkItDown()

    payloads = (
        # Plain input from a stream
        b"<html><body><h1>Test</h1></body></html>",
        # Input with leading blank characters
        b" \n\n\n<html><body><h1>Test</h1></body></html>",
    )
    for payload in payloads:
        result = converter.convert_stream(io.BytesIO(payload))
        assert "# Test" in result.text_content
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    skip_remote,
    reason="do not run tests that query external urls",
)
def test_markitdown_remote() -> None:
    """Convert remote resources (a PDF URL, then a YouTube URL) and check the text."""
    converter = MarkItDown()

    remote_cases = (
        (PDF_TEST_URL, PDF_TEST_STRINGS),  # By URL
        (YOUTUBE_TEST_URL, YOUTUBE_TEST_STRINGS),  # Youtube
    )
    for url, expected_strings in remote_cases:
        result = converter.convert(url)
        for expected in expected_strings:
            assert expected in result.text_content
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    skip_remote,
    reason="do not run remotely run speech transcription tests",
)
def test_speech_transcription() -> None:
    """Check that WAV, MP3 and M4A test files each yield the numbers one to five."""
    converter = MarkItDown()

    # Each number may appear either as a digit or spelled out.
    spoken_numbers = (
        ("1", "one"),
        ("2", "two"),
        ("3", "three"),
        ("4", "four"),
        ("5", "five"),
    )

    for file_name in ["test.wav", "test.mp3", "test.m4a"]:
        result = converter.convert(os.path.join(TEST_FILES_DIR, file_name))
        transcript = result.text_content.lower()
        assert all(
            digit in transcript or word in transcript
            for digit, word in spoken_numbers
        )
|
||||
|
||||
|
||||
def test_exceptions() -> None:
    """Check the exceptions raised for unsupported and corrupted input."""
    converter = MarkItDown()
    random_file = os.path.join(TEST_FILES_DIR, "random.bin")

    # An unsupported format must raise UnsupportedFormatException.
    with pytest.raises(UnsupportedFormatException):
        converter.convert(random_file)

    # Forcing the .pptx extension on the same file must raise
    # FileConversionException, recording exactly one PptxConverter attempt.
    with pytest.raises(FileConversionException) as exc_info:
        converter.convert(random_file, file_extension=".pptx")

    attempts = exc_info.value.attempts
    assert len(attempts) == 1
    assert type(attempts[0].converter).__name__ == "PptxConverter"
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
    """Test exiftool metadata extraction, configured explicitly and via env var.

    The EXIFTOOL_PATH environment variable set for the second half of the test
    is restored afterwards so the change cannot leak into other tests.
    """
    which_exiftool = shutil.which("exiftool")
    assert which_exiftool is not None

    # Test explicitly setting the location of exiftool
    markitdown = MarkItDown(exiftool_path=which_exiftool)
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key, value in JPG_TEST_EXIFTOOL.items():
        assert f"{key}: {value}" in result.text_content

    # Test setting the exiftool path through an environment variable
    previous_path = os.environ.get("EXIFTOOL_PATH")
    os.environ["EXIFTOOL_PATH"] = which_exiftool
    try:
        markitdown = MarkItDown()
        result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
        for key, value in JPG_TEST_EXIFTOOL.items():
            assert f"{key}: {value}" in result.text_content

        # Test some other media types
        result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3"))
        for key, value in MP3_TEST_EXIFTOOL.items():
            assert f"{key}: {value}" in result.text_content
    finally:
        # Put the environment back exactly as it was found.
        if previous_path is None:
            os.environ.pop("EXIFTOOL_PATH", None)
        else:
            os.environ["EXIFTOOL_PATH"] = previous_path
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
    """Check LLM-generated image descriptions for a JPG and for a PPTX."""
    llm_converter = MarkItDown(llm_client=openai.OpenAI(), llm_model="gpt-4o")

    image_result = llm_converter.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
    for marker in LLM_TEST_STRINGS:
        assert marker in image_result.text_content

    # This is not super precise. It would also accept "red square", "blue circle",
    # "the square is not blue", etc. But it's sufficient for this test.
    for word in ["red", "circle", "blue", "square"]:
        assert word in image_result.text_content.lower()

    # Images embedded in PPTX files
    pptx_result = llm_converter.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
    # LLM captions are included
    for marker in LLM_TEST_STRINGS:
        assert marker in pptx_result.text_content
    # Standard alt text is included
    validate_strings(pptx_result, PPTX_TEST_STRINGS)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Runs this file's tests from the command line.
    # pytest's skipif markers are not evaluated when the test functions are
    # called directly, so each test is paired with the flag that guards it.
    for test, skipped in [
        (test_stream_info_operations, False),
        (test_data_uris, False),
        (test_file_uris, False),
        (test_docx_comments, False),
        (test_docx_equations, False),  # previously missing from this list
        (test_input_as_strings, False),
        (test_markitdown_remote, skip_remote),
        (test_speech_transcription, skip_remote),
        (test_exceptions, False),
        (test_markitdown_exiftool, skip_exiftool),
        (test_markitdown_llm, skip_llm),
    ]:
        print(f"Running {test.__name__}...", end="")
        if skipped:
            print("SKIPPED")
            continue
        test()
        print("OK")
    print("All tests passed!")
|
||||
|
|
@ -1,239 +0,0 @@
|
|||
#!/usr/bin/env python3 -m pytest
|
||||
import os
|
||||
import time
|
||||
import pytest
|
||||
import codecs
|
||||
import base64
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
if __name__ == "__main__":
|
||||
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
|
||||
else:
|
||||
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
UnsupportedFormatException,
|
||||
FileConversionException,
|
||||
StreamInfo,
|
||||
)
|
||||
|
||||
# Don't run remote tests in CI (GITHUB_ACTIONS is set to a non-empty value there).
skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_guess_stream_info(test_vector):
    """Test that the first stream-info guess matches the vector's metadata."""
    converter = MarkItDown()

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    _, expected_extension = os.path.splitext(test_vector.filename)

    with open(source_path, "rb") as stream:
        guesses = converter._get_stream_info_guesses(
            stream,
            base_guess=StreamInfo(
                filename=os.path.basename(test_vector.filename),
                local_path=source_path,
                extension=expected_extension,
            ),
        )

        # For some limited exceptions, we can't guarantee the exact
        # mimetype or extension, so they are special-cased here.
        if test_vector.filename in ("test_outlook_msg.msg",):
            return

        best_guess = guesses[0]
        assert best_guess.mimetype == test_vector.mimetype
        assert best_guess.extension == expected_extension
        assert best_guess.charset == test_vector.charset
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_local(test_vector):
    """Test the conversion of a file given by its local path."""
    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    converted = MarkItDown().convert(source_path, url=test_vector.url)

    for required in test_vector.must_include:
        assert required in converted.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in converted.markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_stream_with_hints(test_vector):
    """Test converting a binary stream when full stream info is supplied."""
    converter = MarkItDown()

    _, extension = os.path.splitext(test_vector.filename)
    hints = StreamInfo(
        extension=extension,
        mimetype=test_vector.mimetype,
        charset=test_vector.charset,
    )

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(source_path, "rb") as stream:
        converted = converter.convert(stream, stream_info=hints, url=test_vector.url)
        for required in test_vector.must_include:
            assert required in converted.markdown
        for forbidden in test_vector.must_not_include:
            assert forbidden not in converted.markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_stream_without_hints(test_vector):
    """Test converting a binary stream when no stream info is supplied."""
    converter = MarkItDown()

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(source_path, "rb") as stream:
        converted = converter.convert(stream, url=test_vector.url)
        for required in test_vector.must_include:
            assert required in converted.markdown
        for forbidden in test_vector.must_not_include:
            assert forbidden not in converted.markdown
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    skip_remote,
    reason="do not run tests that query external urls",
)
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_http_uri(test_vector):
    """Test the conversion of an HTTP:// or HTTPS:// URI."""
    converter = MarkItDown()

    time.sleep(1)  # Ensure we don't hit rate limits

    remote_location = TEST_FILES_URL + "/" + test_vector.filename
    converted = converter.convert(
        remote_location,
        url=test_vector.url,  # Mock where this file would be found
    )
    for required in test_vector.must_include:
        assert required in converted.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in converted.markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_file_uri(test_vector):
    """Test the conversion of a file:// URI built from the local test file."""
    converter = MarkItDown()

    file_uri = Path(os.path.join(TEST_FILES_DIR, test_vector.filename)).as_uri()
    converted = converter.convert(file_uri, url=test_vector.url)

    for required in test_vector.must_include:
        assert required in converted.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in converted.markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_data_uri(test_vector):
    """Test the conversion of a base64 data URI built from the test file."""
    converter = MarkItDown()

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(source_path, "rb") as stream:
        encoded = base64.b64encode(stream.read()).decode("utf-8")
    data_uri = f"data:{test_vector.mimetype};base64,{encoded}"

    converted = converter.convert(data_uri, url=test_vector.url)
    for required in test_vector.must_include:
        assert required in converted.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in converted.markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_keep_data_uris(test_vector):
    """Test API functionality when keep_data_uris is enabled (local file input)."""
    converter = MarkItDown()

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    converted = converter.convert(
        source_path,
        keep_data_uris=True,
        url=test_vector.url,
    )

    for required in test_vector.must_include:
        assert required in converted.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in converted.markdown
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_stream_keep_data_uris(test_vector):
    """Test stream conversion with stream info when keep_data_uris is enabled."""
    converter = MarkItDown()

    _, extension = os.path.splitext(test_vector.filename)
    hints = StreamInfo(
        extension=extension,
        mimetype=test_vector.mimetype,
        charset=test_vector.charset,
    )

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(source_path, "rb") as stream:
        converted = converter.convert(
            stream, stream_info=hints, keep_data_uris=True, url=test_vector.url
        )

        for required in test_vector.must_include:
            assert required in converted.markdown
        for forbidden in test_vector.must_not_include:
            assert forbidden not in converted.markdown
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Runs this file's tests from the command line.
    # Each entry pairs a group of test functions with the vectors they run on.
    test_plan = [
        (
            # General tests
            [
                test_guess_stream_info,
                test_convert_local,
                test_convert_stream_with_hints,
                test_convert_stream_without_hints,
                test_convert_http_uri,
                test_convert_file_uri,
                test_convert_data_uri,
            ],
            GENERAL_TEST_VECTORS,
        ),
        (
            # Data URI tests
            [
                test_convert_keep_data_uris,
                test_convert_stream_keep_data_uris,
            ],
            DATA_URI_TEST_VECTORS,
        ),
    ]

    for test_functions, test_vectors in test_plan:
        for test_function in test_functions:
            for test_vector in test_vectors:
                # Progress line is completed by "OK" once the test returns.
                print(
                    f"Running {test_function.__name__} on {test_vector.filename}...", end=""
                )
                test_function(test_vector)
                print("OK")

    print("All tests passed!")
|
||||
|
|
@ -23,4 +23,4 @@ ARG GROUPID=nogroup
|
|||
|
||||
USER $USERID:$GROUPID
|
||||
|
||||
ENTRYPOINT [ "markitdown-mcp" ]
|
||||
ENTRYPOINT [ "markitup-mcp" ]
|
||||
|
|
@ -1,10 +1,10 @@
|
|||
# MarkItDown-MCP
|
||||
# MarkItUp-MCP
|
||||
|
||||
[](https://pypi.org/project/markitdown-mcp/)
|
||||

|
||||
[](https://pypi.org/project/markitup-mcp/)
|
||||

|
||||
[](https://github.com/microsoft/autogen)
|
||||
|
||||
The `markitdown-mcp` package provides a lightweight STDIO and SSE MCP server for calling MarkItDown.
|
||||
The `markitup-mcp` package provides a lightweight STDIO and SSE MCP server for calling MarkItUp.
|
||||
|
||||
It exposes one tool: `convert_to_markdown(uri)`, where uri can be any `http:`, `https:`, `file:`, or `data:` URI.
|
||||
|
||||
|
|
@ -13,7 +13,7 @@ It exposes one tool: `convert_to_markdown(uri)`, where uri can be any `http:`, `
|
|||
To install the package, use pip:
|
||||
|
||||
```bash
|
||||
pip install markitdown-mcp
|
||||
pip install markitup-mcp
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
|
@ -22,30 +22,30 @@ To run the MCP server, using STDIO (default) use the following command:
|
|||
|
||||
|
||||
```bash
|
||||
markitdown-mcp
|
||||
markitup-mcp
|
||||
```
|
||||
|
||||
To run the MCP server, using SSE use the following command:
|
||||
|
||||
```bash
|
||||
markitdown-mcp --sse --host 127.0.0.1 --port 3001
|
||||
markitup-mcp --sse --host 127.0.0.1 --port 3001
|
||||
```
|
||||
|
||||
## Running in Docker
|
||||
|
||||
To run `markitdown-mcp` in Docker, build the Docker image using the provided Dockerfile:
|
||||
To run `markitup-mcp` in Docker, build the Docker image using the provided Dockerfile:
|
||||
```bash
|
||||
docker build -t markitdown-mcp:latest .
|
||||
docker build -t markitup-mcp:latest .
|
||||
```
|
||||
|
||||
And run it using:
|
||||
```bash
|
||||
docker run -it --rm markitdown-mcp:latest
|
||||
docker run -it --rm markitup-mcp:latest
|
||||
```
|
||||
This will be sufficient for remote URIs. To access local files, you need to mount the local directory into the container. For example, if you want to access files in `/home/user/data`, you can run:
|
||||
|
||||
```bash
|
||||
docker run -it --rm -v /home/user/data:/workdir markitdown-mcp:latest
|
||||
docker run -it --rm -v /home/user/data:/workdir markitup-mcp:latest
|
||||
```
|
||||
|
||||
Once mounted, all files under data will be accessible under `/workdir` in the container. For example, if you have a file `example.txt` in `/home/user/data`, it will be accessible in the container at `/workdir/example.txt`.
|
||||
|
|
@ -61,13 +61,13 @@ Edit it to include the following JSON entry:
|
|||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"markitdown": {
|
||||
"markitup": {
|
||||
"command": "docker",
|
||||
"args": [
|
||||
"run",
|
||||
"--rm",
|
||||
"-i",
|
||||
"markitdown-mcp:latest"
|
||||
"markitup-mcp:latest"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@ -79,7 +79,7 @@ If you want to mount a directory, adjust it accordingly:
|
|||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"markitdown": {
|
||||
"markitup": {
|
||||
"command": "docker",
|
||||
"args": [
|
||||
"run",
|
||||
|
|
@ -87,7 +87,7 @@ If you want to mount a directory, adjust it accordingly:
|
|||
"-i",
|
||||
"-v",
|
||||
"/home/user/data:/workdir",
|
||||
"markitdown-mcp:latest"
|
||||
"markitup-mcp:latest"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@ -106,7 +106,7 @@ You can then connect to the inspector through the specified host and port (e.g.,
|
|||
|
||||
If using STDIO:
|
||||
* select `STDIO` as the transport type,
|
||||
* input `markitdown-mcp` as the command, and
|
||||
* input `markitup-mcp` as the command, and
|
||||
* click `Connect`
|
||||
|
||||
If using SSE:
|
||||
|
|
@ -3,9 +3,9 @@ requires = ["hatchling"]
|
|||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "markitdown-mcp"
|
||||
name = "markitup-mcp"
|
||||
dynamic = ["version"]
|
||||
description = 'An MCP server for the "markitdown" library.'
|
||||
description = 'An MCP server for the "markitup" library.'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
|
|
@ -25,38 +25,38 @@ classifiers = [
|
|||
]
|
||||
dependencies = [
|
||||
"mcp~=1.5.0",
|
||||
"markitdown[all]>=0.1.1,<0.2.0",
|
||||
"markitup[all]>=0.1.1,<0.2.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
||||
Source = "https://github.com/microsoft/markitdown"
|
||||
Documentation = "https://github.com/microsoft/markitup#readme"
|
||||
Issues = "https://github.com/microsoft/markitup/issues"
|
||||
Source = "https://github.com/microsoft/markitup"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/markitdown_mcp/__about__.py"
|
||||
path = "src/markitup_mcp/__about__.py"
|
||||
|
||||
[project.scripts]
|
||||
markitdown-mcp = "markitdown_mcp.__main__:main"
|
||||
markitup-mcp = "markitup_mcp.__main__:main"
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
"mypy>=1.0.0",
|
||||
]
|
||||
[tool.hatch.envs.types.scripts]
|
||||
check = "mypy --install-types --non-interactive {args:src/markitdown_mcp tests}"
|
||||
check = "mypy --install-types --non-interactive {args:src/markitup_mcp tests}"
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["markitdown-mcp", "tests"]
|
||||
source_pkgs = ["markitup-mcp", "tests"]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit = [
|
||||
"src/markitdown_mcp/__about__.py",
|
||||
"src/markitup_mcp/__about__.py",
|
||||
]
|
||||
|
||||
[tool.coverage.paths]
|
||||
markitdown-mcp = ["src/markitdown_mcp", "*/markitdown-mcp/src/markitdown_mcp"]
|
||||
tests = ["tests", "*/markitdown-mcp/tests"]
|
||||
markitup-mcp = ["src/markitup_mcp", "*/markitup-mcp/src/markitup_mcp"]
|
||||
tests = ["tests", "*/markitup-mcp/tests"]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
|
|
@ -66,4 +66,4 @@ exclude_lines = [
|
|||
]
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
only-include = ["src/markitdown_mcp"]
|
||||
only-include = ["src/markitup_mcp"]
|
||||
|
|
@ -6,17 +6,17 @@ from mcp.server.sse import SseServerTransport
|
|||
from starlette.requests import Request
|
||||
from starlette.routing import Mount, Route
|
||||
from mcp.server import Server
|
||||
from markitdown import MarkItDown
|
||||
from markitup import MarkItUp
|
||||
import uvicorn
|
||||
|
||||
# Initialize FastMCP server for MarkItDown (SSE)
|
||||
mcp = FastMCP("markitdown")
|
||||
# Initialize FastMCP server for MarkItUp (SSE)
|
||||
mcp = FastMCP("markitup")
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
async def convert_to_markdown(uri: str) -> str:
|
||||
"""Convert a resource described by an http:, https:, file: or data: URI to markdown"""
|
||||
return MarkItDown().convert_uri(uri).markdown
|
||||
return MarkItUp().convert_uri(uri).markdown
|
||||
|
||||
|
||||
def create_starlette_app(mcp_server: Server, *, debug: bool = False) -> Starlette:
|
||||
|
|
@ -49,7 +49,7 @@ def main():
|
|||
|
||||
mcp_server = mcp._mcp_server
|
||||
|
||||
parser = argparse.ArgumentParser(description="Run MCP SSE-based MarkItDown server")
|
||||
parser = argparse.ArgumentParser(description="Run MCP SSE-based MarkItUp server")
|
||||
|
||||
parser.add_argument(
|
||||
"--sse",
|
||||
|
|
@ -1,17 +1,17 @@
|
|||
# MarkItDown Sample Plugin
|
||||
# MarkItUp Sample Plugin
|
||||
|
||||
[](https://pypi.org/project/markitdown-sample-plugin/)
|
||||

|
||||
[](https://pypi.org/project/markitup-sample-plugin/)
|
||||

|
||||
[](https://github.com/microsoft/autogen)
|
||||
|
||||
|
||||
This project shows how to create a sample plugin for MarkItDown. The most important parts are as follows:
|
||||
This project shows how to create a sample plugin for MarkItUp. The most important parts are as follows:
|
||||
|
||||
Next, implement your custom DocumentConverter:
|
||||
|
||||
```python
|
||||
from typing import BinaryIO, Any
|
||||
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
from markitup import MarkItUp, DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
|
||||
class RtfConverter(DocumentConverter):
|
||||
|
||||
|
|
@ -51,22 +51,22 @@ Next, make sure your package implements and exports the following:
|
|||
# The only supported version is 1 for now.
|
||||
__plugin_interface_version__ = 1
|
||||
|
||||
# The main entrypoint for the plugin. This is called each time MarkItDown instances are created.
|
||||
def register_converters(markitdown: MarkItDown, **kwargs):
|
||||
# The main entrypoint for the plugin. This is called each time MarkItUp instances are created.
|
||||
def register_converters(markitup: MarkItUp, **kwargs):
|
||||
"""
|
||||
Called during construction of MarkItDown instances to register converters provided by plugins.
|
||||
Called during construction of MarkItUp instances to register converters provided by plugins.
|
||||
"""
|
||||
|
||||
# Simply create and attach an RtfConverter instance
|
||||
markitdown.register_converter(RtfConverter())
|
||||
markitup.register_converter(RtfConverter())
|
||||
```
|
||||
|
||||
|
||||
Finally, create an entrypoint in the `pyproject.toml` file:
|
||||
|
||||
```toml
|
||||
[project.entry-points."markitdown.plugin"]
|
||||
sample_plugin = "markitdown_sample_plugin"
|
||||
[project.entry-points."markitup.plugin"]
|
||||
sample_plugin = "markitup_sample_plugin"
|
||||
```
|
||||
|
||||
Here, the value of `sample_plugin` can be any key, but should ideally be the name of the plugin. The value is the fully qualified name of the package implementing the plugin.
|
||||
|
|
@ -74,30 +74,30 @@ Here, the value of `sample_plugin` can be any key, but should ideally be the nam
|
|||
|
||||
## Installation
|
||||
|
||||
To use the plugin with MarkItDown, it must be installed. To install the plugin from the current directory use:
|
||||
To use the plugin with MarkItUp, it must be installed. To install the plugin from the current directory use:
|
||||
|
||||
```bash
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
Once the plugin package is installed, verify that it is available to MarkItDown by running:
|
||||
Once the plugin package is installed, verify that it is available to MarkItUp by running:
|
||||
|
||||
```bash
|
||||
markitdown --list-plugins
|
||||
markitup --list-plugins
|
||||
```
|
||||
|
||||
To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert an RTF file:
|
||||
|
||||
```bash
|
||||
markitdown --use-plugins path-to-file.rtf
|
||||
markitup --use-plugins path-to-file.rtf
|
||||
```
|
||||
|
||||
In Python, plugins can be enabled as follows:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from markitup import MarkItUp
|
||||
|
||||
md = MarkItDown(enable_plugins=True)
|
||||
md = MarkItUp(enable_plugins=True)
|
||||
result = md.convert("path-to-file.rtf")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
|
@ -3,9 +3,9 @@ requires = ["hatchling"]
|
|||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "markitdown-sample-plugin"
|
||||
name = "markitup-sample-plugin"
|
||||
dynamic = ["version"]
|
||||
description = 'A sample plugin for the "markitdown" library.'
|
||||
description = 'A sample plugin for the "markitup" library.'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
|
|
@ -24,40 +24,40 @@ classifiers = [
|
|||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
dependencies = [
|
||||
"markitdown>=0.1.0a1",
|
||||
"markitup>=0.1.0a1",
|
||||
"striprtf",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
||||
Source = "https://github.com/microsoft/markitdown"
|
||||
Documentation = "https://github.com/microsoft/markitup#readme"
|
||||
Issues = "https://github.com/microsoft/markitup/issues"
|
||||
Source = "https://github.com/microsoft/markitup"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/markitdown_sample_plugin/__about__.py"
|
||||
path = "src/markitup_sample_plugin/__about__.py"
|
||||
|
||||
# IMPORTANT: MarkItDown will look for this entry point to find the plugin.
|
||||
[project.entry-points."markitdown.plugin"]
|
||||
sample_plugin = "markitdown_sample_plugin"
|
||||
[project.entry-points."markitup.plugin"]
|
||||
sample_plugin = "markitup_sample_plugin"
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
"mypy>=1.0.0",
|
||||
]
|
||||
[tool.hatch.envs.types.scripts]
|
||||
check = "mypy --install-types --non-interactive {args:src/markitdown_sample_plugin tests}"
|
||||
check = "mypy --install-types --non-interactive {args:src/markitup_sample_plugin tests}"
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["markitdown-sample-plugin", "tests"]
|
||||
source_pkgs = ["markitup-sample-plugin", "tests"]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit = [
|
||||
"src/markitdown_sample_plugin/__about__.py",
|
||||
"src/markitup_sample_plugin/__about__.py",
|
||||
]
|
||||
|
||||
[tool.coverage.paths]
|
||||
markitdown-sample-plugin = ["src/markitdown_sample_plugin", "*/markitdown-sample-plugin/src/markitdown_sample_plugin"]
|
||||
tests = ["tests", "*/markitdown-sample-plugin/tests"]
|
||||
markitup-sample-plugin = ["src/markitup_sample_plugin", "*/markitup-sample-plugin/src/markitup_sample_plugin"]
|
||||
tests = ["tests", "*/markitup-sample-plugin/tests"]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
|
|
@ -67,4 +67,4 @@ exclude_lines = [
|
|||
]
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
only-include = ["src/markitdown_sample_plugin"]
|
||||
only-include = ["src/markitup_sample_plugin"]
|
||||
|
|
@ -2,8 +2,8 @@ import locale
|
|||
from typing import BinaryIO, Any
|
||||
from striprtf.striprtf import rtf_to_text
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
from markitup import (
|
||||
MarkItUp,
|
||||
DocumentConverter,
|
||||
DocumentConverterResult,
|
||||
StreamInfo,
|
||||
|
|
@ -22,13 +22,13 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
|
|||
ACCEPTED_FILE_EXTENSIONS = [".rtf"]
|
||||
|
||||
|
||||
def register_converters(markitdown: MarkItDown, **kwargs):
|
||||
def register_converters(markitup: MarkItUp, **kwargs):
|
||||
"""
|
||||
Called during construction of MarkItDown instances to register converters provided by plugins.
|
||||
Called during construction of MarkItUp instances to register converters provided by plugins.
|
||||
"""
|
||||
|
||||
# Simply create and attach an RtfConverter instance
|
||||
markitdown.register_converter(RtfConverter())
|
||||
markitup.register_converter(RtfConverter())
|
||||
|
||||
|
||||
class RtfConverter(DocumentConverter):
|
||||
|
|
@ -2,14 +2,14 @@
|
|||
import os
|
||||
import pytest
|
||||
|
||||
from markitdown import MarkItDown, StreamInfo
|
||||
from markitdown_sample_plugin import RtfConverter
|
||||
from markitup import MarkItUp, StreamInfo
|
||||
from markitup_sample_plugin import RtfConverter
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
||||
RTF_TEST_STRINGS = {
|
||||
"This is a Sample RTF File",
|
||||
"It is included to test if the MarkItDown sample plugin can correctly convert RTF files.",
|
||||
"It is included to test if the MarkItUp sample plugin can correctly convert RTF files.",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -28,9 +28,9 @@ def test_converter() -> None:
|
|||
assert test_string in result.text_content
|
||||
|
||||
|
||||
def test_markitdown() -> None:
|
||||
"""Tests that MarkItDown correctly loads the plugin."""
|
||||
md = MarkItDown(enable_plugins=True)
|
||||
def test_markitup() -> None:
|
||||
"""Tests that MarkItUp correctly loads the plugin."""
|
||||
md = MarkItUp(enable_plugins=True)
|
||||
result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
|
||||
|
||||
for test_string in RTF_TEST_STRINGS:
|
||||
|
|
@ -40,5 +40,5 @@ def test_markitdown() -> None:
|
|||
if __name__ == "__main__":
|
||||
"""Runs this file's tests from the command line."""
|
||||
test_converter()
|
||||
test_markitdown()
|
||||
test_markitup()
|
||||
print("All tests passed.")
|
||||
|
|
@ -3,10 +3,9 @@ requires = ["hatchling"]
|
|||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "markitdown"
|
||||
name = "markitup"
|
||||
dynamic = ["version"]
|
||||
description = 'Utility tool for converting various files to Markdown'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
keywords = []
|
||||
|
|
@ -29,75 +28,53 @@ dependencies = [
|
|||
"markdownify",
|
||||
"magika~=0.6.1",
|
||||
"charset-normalizer",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
all = [
|
||||
"python-magic>=0.4.27",
|
||||
"python-pptx",
|
||||
"mammoth",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"lxml",
|
||||
"pdfminer.six",
|
||||
"olefile",
|
||||
"pydub",
|
||||
"SpeechRecognition",
|
||||
"youtube-transcript-api~=1.0.0",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
"pymupdf>=1.25.5",
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth", "lxml"]
|
||||
xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
outlook = ["olefile"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
||||
Source = "https://github.com/microsoft/markitdown"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/markitdown/__about__.py"
|
||||
path = "src/markitup/__about__.py"
|
||||
|
||||
[project.scripts]
|
||||
markitdown = "markitdown.__main__:main"
|
||||
markitup = "markitup.__main__:main"
|
||||
|
||||
[tool.hatch.envs.default]
|
||||
features = ["all"]
|
||||
# No features needed since everything is installed by default
|
||||
|
||||
[tool.hatch.envs.hatch-test]
|
||||
features = ["all"]
|
||||
extra-dependencies = [
|
||||
"openai",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
features = ["all"]
|
||||
extra-dependencies = [
|
||||
"openai",
|
||||
"mypy>=1.0.0",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.types.scripts]
|
||||
check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"
|
||||
check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitup tests}"
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["markitdown", "tests"]
|
||||
source_pkgs = ["markitup", "tests"]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit = [
|
||||
"src/markitdown/__about__.py",
|
||||
"src/markitup/__about__.py",
|
||||
]
|
||||
|
||||
[tool.coverage.paths]
|
||||
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
|
||||
tests = ["tests", "*/markitdown/tests"]
|
||||
markitup = ["src/markitup", "*/markitup/src/markitup"]
|
||||
tests = ["tests", "*/markitup/tests"]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
|
|
@ -107,4 +84,4 @@ exclude_lines = [
|
|||
]
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
only-include = ["src/markitdown"]
|
||||
only-include = ["src/markitup"]
|
||||
|
|
@ -3,15 +3,13 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from .__about__ import __version__
|
||||
from ._markitdown import (
|
||||
MarkItDown,
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||
PRIORITY_GENERIC_FILE_FORMAT,
|
||||
from ._markitup import (
|
||||
MarkItUp,
|
||||
)
|
||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||
from ._stream_info import StreamInfo
|
||||
from ._exceptions import (
|
||||
MarkItDownException,
|
||||
MarkItUpException,
|
||||
MissingDependencyException,
|
||||
FailedConversionAttempt,
|
||||
FileConversionException,
|
||||
|
|
@ -20,15 +18,13 @@ from ._exceptions import (
|
|||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"MarkItDown",
|
||||
"MarkItUp",
|
||||
"DocumentConverter",
|
||||
"DocumentConverterResult",
|
||||
"MarkItDownException",
|
||||
"MarkItUpException",
|
||||
"MissingDependencyException",
|
||||
"FailedConversionAttempt",
|
||||
"FileConversionException",
|
||||
"UnsupportedFormatException",
|
||||
"StreamInfo",
|
||||
"PRIORITY_SPECIFIC_FILE_FORMAT",
|
||||
"PRIORITY_GENERIC_FILE_FORMAT",
|
||||
]
|
||||
|
|
@ -8,40 +8,40 @@ import locale
|
|||
from textwrap import dedent
|
||||
from importlib.metadata import entry_points
|
||||
from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult
|
||||
from ._markitup import MarkItUp, StreamInfo, DocumentConverterResult
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert various file formats to markdown.",
|
||||
prog="markitdown",
|
||||
prog="markitup",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
usage=dedent(
|
||||
"""
|
||||
SYNTAX:
|
||||
|
||||
markitdown <OPTIONAL: FILENAME>
|
||||
If FILENAME is empty, markitdown reads from stdin.
|
||||
markitup <OPTIONAL: FILENAME>
|
||||
If FILENAME is empty, markitup reads from stdin.
|
||||
|
||||
EXAMPLE:
|
||||
|
||||
markitdown example.pdf
|
||||
markitup example.pdf
|
||||
|
||||
OR
|
||||
|
||||
cat example.pdf | markitdown
|
||||
cat example.pdf | markitup
|
||||
|
||||
OR
|
||||
|
||||
markitdown < example.pdf
|
||||
markitup < example.pdf
|
||||
|
||||
OR to save to a file use
|
||||
|
||||
markitdown example.pdf -o example.md
|
||||
markitup example.pdf -o example.md
|
||||
|
||||
OR
|
||||
|
||||
markitdown example.pdf > example.md
|
||||
markitup example.pdf > example.md
|
||||
"""
|
||||
).strip(),
|
||||
)
|
||||
|
|
@ -158,12 +158,12 @@ def main():
|
|||
|
||||
if args.list_plugins:
|
||||
# List installed plugins, then exit
|
||||
print("Installed MarkItDown 3rd-party Plugins:\n")
|
||||
plugin_entry_points = list(entry_points(group="markitdown.plugin"))
|
||||
print("Installed MarkItUp 3rd-party Plugins:\n")
|
||||
plugin_entry_points = list(entry_points(group="markitup.plugin"))
|
||||
if len(plugin_entry_points) == 0:
|
||||
print(" * No 3rd-party plugins installed.")
|
||||
print(
|
||||
"\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n"
|
||||
"\nFind plugins by searching for the hashtag #markitup-plugin on GitHub.\n"
|
||||
)
|
||||
else:
|
||||
for entry_point in plugin_entry_points:
|
||||
|
|
@ -181,20 +181,20 @@ def main():
|
|||
elif args.filename is None:
|
||||
_exit_with_error("Filename is required when using Document Intelligence.")
|
||||
|
||||
markitdown = MarkItDown(
|
||||
markitup = MarkItUp(
|
||||
enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
|
||||
)
|
||||
else:
|
||||
markitdown = MarkItDown(enable_plugins=args.use_plugins)
|
||||
markitup = MarkItUp(enable_plugins=args.use_plugins)
|
||||
|
||||
if args.filename is None:
|
||||
result = markitdown.convert_stream(
|
||||
result = markitup.convert_stream(
|
||||
sys.stdin.buffer,
|
||||
stream_info=stream_info,
|
||||
keep_data_uris=args.keep_data_uris,
|
||||
)
|
||||
else:
|
||||
result = markitdown.convert(
|
||||
result = markitup.convert(
|
||||
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
|
||||
)
|
||||
|
||||
|
|
@ -1,8 +1,9 @@
|
|||
import os
|
||||
import tempfile
|
||||
from warnings import warn
|
||||
from typing import Any, Union, BinaryIO, Optional, List
|
||||
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
||||
from ._stream_info import StreamInfo
|
||||
import re
|
||||
|
||||
|
||||
class DocumentConverterResult:
|
||||
|
|
@ -26,6 +27,61 @@ class DocumentConverterResult:
|
|||
"""
|
||||
self.markdown = markdown
|
||||
self.title = title
|
||||
|
||||
def to_llm(self) -> List[Dict[str, Any]]:
    """
    Split the markdown into an ordered list of text and inline-image parts.

    The markdown is scanned for images embedded as base64 data URIs, i.e.
    ``![alt](data:<content-type>;base64,<data>)``. Text between images is
    emitted as ``{"type": "text", "text": ...}`` with surrounding whitespace
    stripped; chunks that are empty after stripping are dropped. Each image
    is emitted as ``{"type": "image", "image_url": {"url": ...}, "alt_text": ...}``
    with the data URI rebuilt from the matched content type and payload.

    Returns:
        List[Dict[str, Any]]: text and image parts in document order.
    """
    # NOTE(review): intended for OpenAI-style message content; confirm that
    # consumers expect "type": "image" (rather than "image_url") and accept
    # the extra "alt_text" key.
    image_pattern = re.compile(r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)')
    source = self.markdown
    parts: List[Dict[str, Any]] = []

    def add_text(raw: str) -> None:
        # Whitespace-only (or empty) chunks never become text parts.
        stripped = raw.strip()
        if stripped:
            parts.append({
                "type": "text",
                "text": stripped
            })

    cursor = 0
    for found in image_pattern.finditer(source):
        # Text sitting between the previous image (or the start) and this one.
        add_text(source[cursor:found.start()])

        alt_text, content_type, b64_data = found.groups()
        parts.append({
            "type": "image",
            "image_url": {
                "url": f"data:{content_type};base64,{b64_data}"
            },
            "alt_text": alt_text
        })
        cursor = found.end()

    # Whatever follows the last image (or the whole document if none matched).
    add_text(source[cursor:])
    return parts
|
||||
|
||||
@property
|
||||
def text_content(self) -> str:
|
||||
|
|
@ -45,45 +101,6 @@ class DocumentConverterResult:
|
|||
class DocumentConverter:
|
||||
"""Abstract superclass of all DocumentConverters."""
|
||||
|
||||
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """
    Return a quick determination of whether the converter should attempt converting the document.
    This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
    In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
    make a determination (e.g., special converters for Wikipedia, YouTube etc).
    Finally, it is conceivable that the `stream_info.filename` might be used in cases
    where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc).

    NOTE: The method signature is designed to match that of the convert() method. This provides some
    assurance that, if accepts() returns True, the convert() method will also be able to handle the document.

    IMPORTANT: In rare cases (e.g., OutlookMsgConverter), we need to read more from the stream to make a final
    determination. Read operations inevitably advance the position in file_stream. In these cases, the position
    MUST be reset before returning. This is because the convert() method may be called immediately
    after accepts(), and will expect the file_stream to be at the original position.

    E.g.,
    cur_pos = file_stream.tell()  # Save the current position
    data = file_stream.read(100)  # ... peek at the first 100 bytes, etc.
    file_stream.seek(cur_pos)     # Reset the position to the original position

    Parameters:
    - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
    - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
    - kwargs: Additional keyword arguments for the converter.

    Returns:
    - bool: True if the converter can handle the document, False otherwise.

    Raises:
    - NotImplementedError: Always, in this base implementation; subclasses must override it.
    """
    # The base class has no acceptance logic of its own; name the concrete
    # subclass in the message so the missing override is easy to locate.
    raise NotImplementedError(
        f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
    )
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
@ -1,24 +1,24 @@
|
|||
from typing import Optional, List, Any
|
||||
|
||||
MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItDown. For example:
|
||||
MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItUp. For example:
|
||||
|
||||
* pip install markitdown[{feature}]
|
||||
* pip install markitdown[all]
|
||||
* pip install markitdown[{feature}, ...]
|
||||
* pip install markitup[{feature}]
|
||||
* pip install markitup[all]
|
||||
* pip install markitup[{feature}, ...]
|
||||
* etc."""
|
||||
|
||||
|
||||
class MarkItDownException(Exception):
|
||||
class MarkItUpException(Exception):
|
||||
"""
|
||||
Base exception class for MarkItDown.
|
||||
Base exception class for MarkItUp.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class MissingDependencyException(MarkItDownException):
|
||||
class MissingDependencyException(MarkItUpException):
|
||||
"""
|
||||
Converters shipped with MarkItDown may depend on optional
|
||||
Converters shipped with MarkItUp may depend on optional
|
||||
dependencies. This exception is thrown when a converter's
|
||||
convert() method is called, but the required dependency is not
|
||||
installed. This is not necessarily a fatal error, as the converter
|
||||
|
|
@ -31,7 +31,7 @@ class MissingDependencyException(MarkItDownException):
|
|||
pass
|
||||
|
||||
|
||||
class UnsupportedFormatException(MarkItDownException):
|
||||
class UnsupportedFormatException(MarkItUpException):
|
||||
"""
|
||||
Thrown when no suitable converter was found for the given file.
|
||||
"""
|
||||
|
|
@ -49,7 +49,7 @@ class FailedConversionAttempt(object):
|
|||
self.exc_info = exc_info
|
||||
|
||||
|
||||
class FileConversionException(MarkItDownException):
|
||||
class FileConversionException(MarkItUpException):
|
||||
"""
|
||||
Thrown when a suitable converter was found, but the conversion
|
||||
process fails for any reason.
|
||||
100
packages/markitup/src/markitup/_markitup.py
Normal file
100
packages/markitup/src/markitup/_markitup.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
from typing import Any, List, Dict, Optional, Union, BinaryIO
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from warnings import warn
|
||||
import magic
|
||||
|
||||
from ._stream_info import StreamInfo
|
||||
|
||||
from .converters import (
|
||||
PlainTextConverter,
|
||||
HtmlConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
XlsxConverter,
|
||||
XlsConverter,
|
||||
PptxConverter,
|
||||
# AudioConverter,
|
||||
CsvConverter,
|
||||
)
|
||||
|
||||
from ._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
from ._exceptions import (
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
FailedConversionAttempt,
|
||||
)
|
||||
|
||||
|
||||
class MarkItUp:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""

    # Ordered (match_by_prefix, MIME pattern, category) rules; the first match wins.
    # Prefix rules use str.startswith, the others require an exact MIME type.
    _CATEGORY_RULES = (
        (True, "image/", "image"),
        (True, "audio/", "audio"),
        (True, "video/", "video"),
        (True, "application/vnd.ms-excel", "xls"),
        (True, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx"),
        (True, "application/vnd.ms-powerpoint", "ppt"),
        (False, "application/vnd.openxmlformats-officedocument.presentationml.presentation", "pptx"),
        (True, "application/msword", "doc"),
        (False, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"),
        (False, "application/pdf", "pdf"),
        (True, "text/", "text"),
    )

    def __init__(
        self,
        config: Optional[Dict[str, Any]] = None,
    ):
        """Store the optional configuration mapping on ``self.config``.

        Args:
            config: Arbitrary settings; this class keeps them but does not
                read them itself.
        """
        self.config = config

    def convert(self, stream: BinaryIO) -> "tuple[DocumentConverterResult, StreamInfo]":
        """Convert a binary stream to Markdown.

        Args:
            stream: Seekable binary stream holding the document.

        Returns:
            A ``(DocumentConverterResult, StreamInfo)`` pair: the converter's
            result and the detected type information for the stream.

        Raises:
            UnsupportedFormatException: If the detected category has no
                converter wired up here (including legacy ``.ppt``).
            FileConversionException: If the selected converter fails.
        """
        stream_info = self._get_stream_info(stream)

        # Deal with unsupported file types
        if stream_info.category == "ppt":
            raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")

        # Built per call so the converter names are only resolved when needed.
        converters = {
            "text": PlainTextConverter,
            "pptx": PptxConverter,
            "pdf": PdfConverter,
        }
        converter_cls = converters.get(stream_info.category)
        if converter_cls is None:
            # Covers "other" as well as recognised categories without a
            # converter yet, so callers never receive a bare StreamInfo.
            raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")

        try:
            return converter_cls().convert(stream, stream_info), stream_info
        except (FileConversionException, UnsupportedFormatException):
            # Already the package's own error types; do not re-wrap.
            raise
        except Exception as exc:
            # FailedConversionAttempt is a plain record class, not an
            # exception, so it cannot be used in an except clause.
            raise FileConversionException(
                f"Failed to convert file of type {stream_info.magic_type}"
            ) from exc

    @staticmethod
    def _categorize(magic_type: str) -> str:
        """Map a MIME type string to this package's coarse category label.

        Returns ``"other"`` when no rule in ``_CATEGORY_RULES`` matches.
        """
        for by_prefix, pattern, category in MarkItUp._CATEGORY_RULES:
            matched = magic_type.startswith(pattern) if by_prefix else magic_type == pattern
            if matched:
                return category
        return "other"

    def _get_stream_info(self, byte_stream: BinaryIO) -> "StreamInfo":
        """Detect the MIME type and category of ``byte_stream`` from its content.

        The whole stream is read from offset 0 for detection, and the
        stream position is restored afterwards even if detection raises.
        """
        original_position = byte_stream.tell()
        try:
            # Reset stream position to beginning and sniff the full content
            byte_stream.seek(0)
            magic_type = magic.from_buffer(byte_stream.read(), mime=True)
        finally:
            byte_stream.seek(original_position)

        return StreamInfo(magic_type=magic_type, category=self._categorize(magic_type))
|
||||
8
packages/markitup/src/markitup/_stream_info.py
Normal file
8
packages/markitup/src/markitup/_stream_info.py
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
from dataclasses import dataclass, asdict
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class StreamInfo:
    """Type information detected for a byte stream.

    Both fields default to ``None`` so an empty instance can be created
    before anything is known about the stream.
    """

    # MIME type string reported by content sniffing, e.g. "application/pdf".
    magic_type: Optional[str] = None

    # Coarse category label derived from the MIME type, e.g. "pdf" or "text".
    category: Optional[str] = None
|
||||
102
packages/markitup/src/markitup/converter_utils/utils.py
Normal file
102
packages/markitup/src/markitup/converter_utils/utils.py
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
import os
|
||||
from io import BytesIO
|
||||
from markitup._stream_info import StreamInfo
|
||||
import magic
|
||||
|
||||
|
||||
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
    """
    Load every regular file directly inside a folder into memory.

    Args:
        folder_path (str): Directory to scan; subdirectories are skipped

    Returns:
        dict: Maps each filename to a BytesIO with that file's bytes,
            positioned at offset 0, inserted in sorted filename order

    Raises:
        FileNotFoundError: If folder_path does not exist
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder '{folder_path}' not found")

    streams = {}
    for entry in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, entry)

        # Only plain files are loaded; anything else is ignored
        if not os.path.isfile(full_path):
            continue

        with open(full_path, "rb") as handle:
            buffer = BytesIO(handle.read())

        # Hand the stream back rewound to the start
        buffer.seek(0)
        streams[entry] = buffer

    return streams
|
||||
|
||||
|
||||
def detect_file_types(file_dict):
    """
    Classify in-memory files by sniffing their content with python-magic.

    Filenames are used only as result keys; detection never looks at
    extensions. Each stream is read in full from offset 0 and then
    returned to the position it had on entry.

    Args:
        file_dict (dict): Dictionary with filenames as keys and BytesIO objects as values

    Returns:
        dict: Dictionary with filenames as keys and StreamInfo objects
            (MIME type plus category label) as values
    """
    # Ordered rules: (MIME prefixes, exact MIME types, category); first match wins.
    # Legacy and OOXML Office types deliberately share one label here.
    category_rules = (
        (("image/",), (), "image"),
        (("audio/",), (), "audio"),
        (("video/",), (), "video"),
        (
            ("application/vnd.ms-excel",),
            ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",),
            "xls",
        ),
        (
            ("application/vnd.ms-powerpoint",),
            ("application/vnd.openxmlformats-officedocument.presentationml.presentation",),
            "ppt",
        ),
        (
            ("application/msword",),
            ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",),
            "doc",
        ),
        ((), ("application/pdf",), "pdf"),
        (("text/",), (), "text"),
    )

    detected = {}

    for name, stream in file_dict.items():
        # Remember where the caller left the stream
        saved_offset = stream.tell()

        # Sniff the complete content from the start
        stream.seek(0)
        mime = magic.from_buffer(stream.read(), mime=True)

        # Anything no rule claims is labelled "other"
        label = "other"
        for prefixes, exact_types, candidate in category_rules:
            if mime.startswith(prefixes) or mime in exact_types:
                label = candidate
                break

        detected[name] = StreamInfo(magic_type=mime, category=label)

        # Put the stream back where it was
        stream.seek(saved_offset)

    return detected
|
||||
|
|
@ -4,30 +4,19 @@
|
|||
|
||||
from ._plain_text_converter import PlainTextConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._rss_converter import RssConverter
|
||||
from ._wikipedia_converter import WikipediaConverter
|
||||
from ._youtube_converter import YouTubeConverter
|
||||
from ._ipynb_converter import IpynbConverter
|
||||
from ._bing_serp_converter import BingSerpConverter
|
||||
from ._pdf_converter import PdfConverter
|
||||
from ._docx_converter import DocxConverter
|
||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||
from ._pptx_converter import PptxConverter
|
||||
from ._image_converter import ImageConverter
|
||||
from ._audio_converter import AudioConverter
|
||||
from ._outlook_msg_converter import OutlookMsgConverter
|
||||
from ._zip_converter import ZipConverter
|
||||
from ._doc_intel_converter import (
|
||||
DocumentIntelligenceConverter,
|
||||
DocumentIntelligenceFileType,
|
||||
)
|
||||
from ._epub_converter import EpubConverter
|
||||
# from ._audio_converter import AudioConverter
|
||||
from ._csv_converter import CsvConverter
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
__all__ = [
|
||||
"PlainTextConverter",
|
||||
"HtmlConverter",
|
||||
"RssConverter",
|
||||
"_CustomMarkdownify",
|
||||
"WikipediaConverter",
|
||||
"YouTubeConverter",
|
||||
"IpynbConverter",
|
||||
|
|
@ -38,7 +27,7 @@ __all__ = [
|
|||
"XlsConverter",
|
||||
"PptxConverter",
|
||||
"ImageConverter",
|
||||
"AudioConverter",
|
||||
# "AudioConverter",
|
||||
"OutlookMsgConverter",
|
||||
"ZipConverter",
|
||||
"DocumentIntelligenceConverter",
|
||||
|
|
@ -2,7 +2,6 @@ import io
|
|||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from ._exiftool import exiftool_metadata
|
||||
from ._transcribe_audio import transcribe_audio
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
|
@ -6,12 +6,12 @@ from .._base_converter import DocumentConverter, DocumentConverterResult
|
|||
from .._stream_info import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
ACCEPTED_MAGIC_TYPE_PREFIXES = [
|
||||
"text/html",
|
||||
"application/xhtml",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [
|
||||
ACCEPTED_FILE_CATEGORY = [
|
||||
".html",
|
||||
".htm",
|
||||
]
|
||||
|
|
@ -19,25 +19,6 @@ ACCEPTED_FILE_EXTENSIONS = [
|
|||
|
||||
class HtmlConverter(DocumentConverter):
|
||||
"""Anything with content type text/html"""
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
@ -45,7 +26,7 @@ class HtmlConverter(DocumentConverter):
|
|||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Parse the stream
|
||||
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
|
||||
encoding = "utf-8"
|
||||
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||
|
||||
# Remove javascript and style blocks
|
||||
|
|
@ -81,10 +62,8 @@ class HtmlConverter(DocumentConverter):
|
|||
return self.convert(
|
||||
file_stream=io.BytesIO(html_content.encode("utf-8")),
|
||||
stream_info=StreamInfo(
|
||||
mimetype="text/html",
|
||||
extension=".html",
|
||||
charset="utf-8",
|
||||
url=url,
|
||||
magic_type="text/html",
|
||||
category="text",
|
||||
),
|
||||
**kwargs,
|
||||
)
|
||||
|
|
@ -108,4 +108,4 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||
return "" % (alt, src, title_part)
|
||||
|
||||
def convert_soup(self, soup: Any) -> str:
|
||||
return super().convert_soup(soup) # type: ignore
|
||||
return super().convert_soup(soup) # type: ignore
|
||||
62
packages/markitup/src/markitup/converters/_pdf_converter.py
Normal file
62
packages/markitup/src/markitup/converters/_pdf_converter.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
from typing import BinaryIO, Any
|
||||
import io
|
||||
import base64
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
import fitz
|
||||
|
||||
|
||||
class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown with embedded images.
    """

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Render each page as a "## Page N" section with its text and images.

        Args:
            file_stream: Binary stream containing the PDF data.
            stream_info: Detected type information; not read by this converter.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            DocumentConverterResult whose markdown holds, per page, the plain
            text followed by the page's images as base64 data URIs.
        """
        # fitz.open(stream=...) expects in-memory data. A BytesIO is passed
        # through as before; any other binary stream is read into bytes.
        if isinstance(file_stream, io.BytesIO):
            pdf_data = file_stream
        else:
            pdf_data = file_stream.read()

        # Create a document object from the stream
        doc = fitz.open(stream=pdf_data, filetype="pdf")

        # Collect fragments and join once, instead of repeated string +=
        parts = []
        image_count = 0
        try:
            for page_num, page in enumerate(doc, start=1):
                # Page marker followed by the page's plain text
                parts.append(f"\n\n## Page {page_num}\n\n")
                parts.append(page.get_text("text") + "\n\n")

                # Extract images from the page
                for img_info in page.get_images(full=True):
                    xref = img_info[0]  # Get the image reference
                    base_image = doc.extract_image(xref)
                    if not base_image:
                        continue

                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    try:
                        # Convert image to base64 for markdown embedding
                        img_base64 = base64.b64encode(image_bytes).decode("utf-8")
                        # Add image to markdown with a unique identifier
                        image_count += 1
                        parts.append(
                            f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
                        )
                    except Exception as e:
                        # Best effort: note the failure inline and keep going
                        parts.append(f"*[Error processing image {image_count}: {str(e)}]*\n\n")
        finally:
            # Close the document to free resources, even if extraction failed
            doc.close()

        return DocumentConverterResult(
            markdown="".join(parts),
        )
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
from typing import BinaryIO, Any
|
||||
from charset_normalizer import from_bytes
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
|
||||
class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Decode the stream's bytes to text and return them as Markdown.

        The encoding is guessed with charset_normalizer. If no candidate
        encoding is found, the bytes are decoded as UTF-8 with undecodable
        sequences replaced.

        Args:
            file_stream: Binary stream to read; consumed from its current position.
            stream_info: Detected type information; not read by this converter.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            DocumentConverterResult with the decoded text as its markdown.
        """
        raw_bytes = file_stream.read()
        best_match = from_bytes(raw_bytes).best()

        if best_match is None:
            # best() returns None when nothing fits; str(None) would put the
            # literal text "None" into the output.
            text_content = raw_bytes.decode("utf-8", errors="replace")
        else:
            text_content = str(best_match)

        return DocumentConverterResult(markdown=text_content)
|
||||
|
|
@ -9,26 +9,16 @@ from typing import BinaryIO, Any
|
|||
from operator import attrgetter
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._llm_caption import llm_caption
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pptx
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
import pptx
|
||||
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
ACCEPTED_MAGIC_TYPE_PREFIXES = [
|
||||
"application/vnd.openxmlformats-officedocument.presentationml",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [".pptx"]
|
||||
ACCEPTED_FILE_CATEGORY = [".pptx"]
|
||||
|
||||
|
||||
class PptxConverter(DocumentConverter):
|
||||
|
|
@ -40,43 +30,12 @@ class PptxConverter(DocumentConverter):
|
|||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".pptx",
|
||||
feature="pptx",
|
||||
)
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_dependency_exc_info[2]
|
||||
)
|
||||
|
||||
# Perform the conversion
|
||||
presentation = pptx.Presentation(file_stream)
|
||||
|
|
@ -95,39 +54,8 @@ class PptxConverter(DocumentConverter):
|
|||
if self._is_picture(shape):
|
||||
# https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
|
||||
|
||||
llm_description = ""
|
||||
alt_text = ""
|
||||
|
||||
# Potentially generate a description using an LLM
|
||||
llm_client = kwargs.get("llm_client")
|
||||
llm_model = kwargs.get("llm_model")
|
||||
if llm_client is not None and llm_model is not None:
|
||||
# Prepare a file_stream and stream_info for the image data
|
||||
image_filename = shape.image.filename
|
||||
image_extension = None
|
||||
if image_filename:
|
||||
image_extension = os.path.splitext(image_filename)[1]
|
||||
image_stream_info = StreamInfo(
|
||||
mimetype=shape.image.content_type,
|
||||
extension=image_extension,
|
||||
filename=image_filename,
|
||||
)
|
||||
|
||||
image_stream = io.BytesIO(shape.image.blob)
|
||||
|
||||
# Caption the image
|
||||
try:
|
||||
llm_description = llm_caption(
|
||||
image_stream,
|
||||
image_stream_info,
|
||||
client=llm_client,
|
||||
model=llm_model,
|
||||
prompt=kwargs.get("llm_prompt"),
|
||||
)
|
||||
except Exception:
|
||||
# Unable to generate a description
|
||||
pass
|
||||
|
||||
# Also grab any description embedded in the deck
|
||||
try:
|
||||
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
|
||||
|
|
@ -136,20 +64,17 @@ class PptxConverter(DocumentConverter):
|
|||
pass
|
||||
|
||||
# Prepare the alt, escaping any special characters
|
||||
alt_text = "\n".join([llm_description, alt_text]) or shape.name
|
||||
alt_text = "\n".join([alt_text]) or shape.name
|
||||
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
|
||||
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
||||
|
||||
# If keep_data_uris is True, use base64 encoding for images
|
||||
if kwargs.get("keep_data_uris", False):
|
||||
blob = shape.image.blob
|
||||
content_type = shape.image.content_type or "image/png"
|
||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||
md_content += f"\n\n"
|
||||
else:
|
||||
# A placeholder name
|
||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||
md_content += "\n\n"
|
||||
|
||||
blob = shape.image.blob
|
||||
content_type = shape.image.content_type or "image/png"
|
||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||
md_content += f"\n\n"
|
||||
|
||||
|
||||
# Tables
|
||||
if self._is_table(shape):
|
||||
|
Before Width: | Height: | Size: 463 KiB After Width: | Height: | Size: 463 KiB |
BIN
packages/markitup/tests/test_files/test.pdf
Normal file
BIN
packages/markitup/tests/test_files/test.pdf
Normal file
Binary file not shown.
BIN
packages/markitup/tests/test_files/test.ppt
Normal file
BIN
packages/markitup/tests/test_files/test.ppt
Normal file
Binary file not shown.
4
packages/markitup/tests/test_files/test.txt
Normal file
4
packages/markitup/tests/test_files/test.txt
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
|
||||
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||
|
||||
This sample TXT file is provided by Sample-Files.com. Visit us for more sample files and resources.
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue