Merge branch 'microsoft:main' into Branch-default-issue-template
This commit is contained in:
commit
3cf405792d
10 changed files with 116 additions and 55 deletions
|
|
@ -6,7 +6,10 @@
|
||||||
// Sets the run context to one level up instead of the .devcontainer folder.
|
// Sets the run context to one level up instead of the .devcontainer folder.
|
||||||
"context": "..",
|
"context": "..",
|
||||||
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
|
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
|
||||||
"dockerfile": "../Dockerfile"
|
"dockerfile": "../Dockerfile",
|
||||||
|
"args": {
|
||||||
|
"INSTALL_GIT": "true"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
// Features to add to the dev container. More info: https://containers.dev/features.
|
||||||
|
|
|
||||||
6
.github/dependabot.yml
vendored
Normal file
6
.github/dependabot.yml
vendored
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
version: 2
|
||||||
|
updates:
|
||||||
|
- package-ecosystem: "github-actions"
|
||||||
|
directory: "/"
|
||||||
|
schedule:
|
||||||
|
interval: "weekly"
|
||||||
4
.github/workflows/pre-commit.yml
vendored
4
.github/workflows/pre-commit.yml
vendored
|
|
@ -5,9 +5,9 @@ jobs:
|
||||||
pre-commit:
|
pre-commit:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v4
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v2
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: "3.x"
|
python-version: "3.x"
|
||||||
|
|
||||||
|
|
|
||||||
6
.github/workflows/tests.yml
vendored
6
.github/workflows/tests.yml
vendored
|
|
@ -5,8 +5,8 @@ jobs:
|
||||||
tests:
|
tests:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v4
|
||||||
- uses: actions/setup-python@v4
|
- uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: |
|
python-version: |
|
||||||
3.10
|
3.10
|
||||||
|
|
@ -14,7 +14,7 @@ jobs:
|
||||||
3.12
|
3.12
|
||||||
- name: Set up pip cache
|
- name: Set up pip cache
|
||||||
if: runner.os == 'Linux'
|
if: runner.os == 'Linux'
|
||||||
uses: actions/cache@v3
|
uses: actions/cache@v4
|
||||||
with:
|
with:
|
||||||
path: ~/.cache/pip
|
path: ~/.cache/pip
|
||||||
key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
|
key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
|
||||||
|
|
|
||||||
|
|
@ -2,10 +2,15 @@ FROM python:3.13-slim-bullseye
|
||||||
|
|
||||||
USER root
|
USER root
|
||||||
|
|
||||||
|
ARG INSTALL_GIT=false
|
||||||
|
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||||
|
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
||||||
|
fi
|
||||||
|
|
||||||
# Runtime dependency
|
# Runtime dependency
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
ffmpeg \
|
ffmpeg \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN pip install markitdown
|
RUN pip install markitdown
|
||||||
|
|
||||||
|
|
|
||||||
21
README.md
21
README.md
|
|
@ -5,6 +5,7 @@
|
||||||
|
|
||||||
[](https://pypi.org/project/markitdown/)
|
[](https://pypi.org/project/markitdown/)
|
||||||

|

|
||||||
|
[](https://github.com/microsoft/autogen)
|
||||||
|
|
||||||
|
|
||||||
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
||||||
|
|
@ -29,6 +30,12 @@ To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can
|
||||||
markitdown path-to-file.pdf > document.md
|
markitdown path-to-file.pdf > document.md
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Or use `-o` to specify the output file:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
markitdown path-to-file.pdf -o document.md
|
||||||
|
```
|
||||||
|
|
||||||
You can also pipe content:
|
You can also pipe content:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
@ -116,6 +123,20 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
|
||||||
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
|
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
|
||||||
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
||||||
|
|
||||||
|
### How to Contribute
|
||||||
|
|
||||||
|
You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are, of course, just suggestions, and you are welcome to contribute in any way you like.
|
||||||
|
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
| | All | Especially Needs Help from Community |
|
||||||
|
|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
|
||||||
|
| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
|
||||||
|
| **PRs** | [All PRs](https://github.com/microsoft/markitdown/pulls) | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22) |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### Running Tests and Checks
|
### Running Tests and Checks
|
||||||
|
|
||||||
- Install `hatch` in your environment and run tests:
|
- Install `hatch` in your environment and run tests:
|
||||||
|
|
|
||||||
|
|
@ -1,48 +1,80 @@
|
||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
import sys
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import sys
|
||||||
from textwrap import dedent
|
from textwrap import dedent
|
||||||
from ._markitdown import MarkItDown
|
from .__about__ import __version__
|
||||||
|
from ._markitdown import MarkItDown, DocumentConverterResult
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Convert various file formats to markdown.",
|
description="Convert various file formats to markdown.",
|
||||||
|
prog="markitdown",
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
usage=dedent(
|
usage=dedent(
|
||||||
"""
|
"""
|
||||||
SYNTAX:
|
SYNTAX:
|
||||||
|
|
||||||
markitdown <OPTIONAL: FILENAME>
|
markitdown <OPTIONAL: FILENAME>
|
||||||
If FILENAME is empty, markitdown reads from stdin.
|
If FILENAME is empty, markitdown reads from stdin.
|
||||||
|
|
||||||
EXAMPLE:
|
EXAMPLE:
|
||||||
|
|
||||||
markitdown example.pdf
|
markitdown example.pdf
|
||||||
|
|
||||||
|
OR
|
||||||
|
|
||||||
|
cat example.pdf | markitdown
|
||||||
|
|
||||||
|
OR
|
||||||
|
|
||||||
|
markitdown < example.pdf
|
||||||
|
|
||||||
|
OR to save to a file use
|
||||||
|
|
||||||
|
markitdown example.pdf -o example.md
|
||||||
|
|
||||||
OR
|
OR
|
||||||
|
|
||||||
cat example.pdf | markitdown
|
markitdown example.pdf > example.md
|
||||||
|
|
||||||
OR
|
|
||||||
|
|
||||||
markitdown < example.pdf
|
|
||||||
"""
|
"""
|
||||||
).strip(),
|
).strip(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-v",
|
||||||
|
"--version",
|
||||||
|
action="version",
|
||||||
|
version=f"%(prog)s {__version__}",
|
||||||
|
help="show the version number and exit",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument("filename", nargs="?")
|
parser.add_argument("filename", nargs="?")
|
||||||
|
parser.add_argument(
|
||||||
|
"-o",
|
||||||
|
"--output",
|
||||||
|
help="Output file name. If not provided, output is written to stdout.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.filename is None:
|
if args.filename is None:
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown()
|
||||||
result = markitdown.convert_stream(sys.stdin.buffer)
|
result = markitdown.convert_stream(sys.stdin.buffer)
|
||||||
print(result.text_content)
|
_handle_output(args, result)
|
||||||
else:
|
else:
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown()
|
||||||
result = markitdown.convert(args.filename)
|
result = markitdown.convert(args.filename)
|
||||||
|
_handle_output(args, result)
|
||||||
|
|
||||||
|
|
||||||
|
def _handle_output(args, result: DocumentConverterResult):
|
||||||
|
"""Handle output to stdout or file"""
|
||||||
|
if args.output:
|
||||||
|
with open(args.output, "w", encoding="utf-8") as f:
|
||||||
|
f.write(result.text_content)
|
||||||
|
else:
|
||||||
print(result.text_content)
|
print(result.text_content)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
from pathlib import Path
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
from warnings import warn, resetwarnings, catch_warnings
|
from warnings import warn, resetwarnings, catch_warnings
|
||||||
|
|
||||||
|
|
@ -1286,11 +1287,11 @@ class MarkItDown:
|
||||||
self.register_page_converter(ZipConverter())
|
self.register_page_converter(ZipConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response], **kwargs: Any
|
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
- source: can be a string representing a path or url, or a requests.response object
|
- source: can be a local path (given as a string or a pathlib.Path object), a url string, or a requests.Response object
|
||||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -1307,10 +1308,14 @@ class MarkItDown:
|
||||||
# Request response
|
# Request response
|
||||||
elif isinstance(source, requests.Response):
|
elif isinstance(source, requests.Response):
|
||||||
return self.convert_response(source, **kwargs)
|
return self.convert_response(source, **kwargs)
|
||||||
|
elif isinstance(source, Path):
|
||||||
|
return self.convert_local(source, **kwargs)
|
||||||
|
|
||||||
def convert_local(
|
def convert_local(
|
||||||
self, path: str, **kwargs: Any
|
self, path: Union[str, Path], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
|
if isinstance(path, Path):
|
||||||
|
path = str(path)
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
ext = kwargs.get("file_extension")
|
ext = kwargs.get("file_extension")
|
||||||
extensions = [ext] if ext is not None else []
|
extensions = [ext] if ext is not None else []
|
||||||
|
|
|
||||||
0
src/markitdown/py.typed
Normal file
0
src/markitdown/py.typed
Normal file
|
|
@ -131,6 +131,17 @@ LLM_TEST_STRINGS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
def validate_strings(result, expected_strings, exclude_strings=None):
|
||||||
|
"""Validate presence or absence of specific strings."""
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
for string in expected_strings:
|
||||||
|
assert string in text_content
|
||||||
|
if exclude_strings:
|
||||||
|
for string in exclude_strings:
|
||||||
|
assert string not in text_content
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_remote,
|
skip_remote,
|
||||||
reason="do not run tests that query external urls",
|
reason="do not run tests that query external urls",
|
||||||
|
|
@ -163,73 +174,53 @@ def test_markitdown_local() -> None:
|
||||||
|
|
||||||
# Test XLSX processing
|
# Test XLSX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
||||||
for test_string in XLSX_TEST_STRINGS:
|
validate_strings(result, XLSX_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test DOCX processing
|
# Test DOCX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
|
||||||
for test_string in DOCX_TEST_STRINGS:
|
validate_strings(result, DOCX_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test DOCX processing, with comments
|
# Test DOCX processing, with comments
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
|
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
|
||||||
style_map="comment-reference => ",
|
style_map="comment-reference => ",
|
||||||
)
|
)
|
||||||
for test_string in DOCX_COMMENT_TEST_STRINGS:
|
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test DOCX processing, with comments and setting style_map on init
|
# Test DOCX processing, with comments and setting style_map on init
|
||||||
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
|
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
|
||||||
result = markitdown_with_style_map.convert(
|
result = markitdown_with_style_map.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
|
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
|
||||||
)
|
)
|
||||||
for test_string in DOCX_COMMENT_TEST_STRINGS:
|
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test PPTX processing
|
# Test PPTX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
|
||||||
for test_string in PPTX_TEST_STRINGS:
|
validate_strings(result, PPTX_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test HTML processing
|
# Test HTML processing
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
|
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
|
||||||
)
|
)
|
||||||
for test_string in BLOG_TEST_STRINGS:
|
validate_strings(result, BLOG_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test ZIP file processing
|
# Test ZIP file processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
|
||||||
for test_string in DOCX_TEST_STRINGS:
|
validate_strings(result, XLSX_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test Wikipedia processing
|
# Test Wikipedia processing
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
|
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
|
||||||
)
|
)
|
||||||
text_content = result.text_content.replace("\\", "")
|
text_content = result.text_content.replace("\\", "")
|
||||||
for test_string in WIKIPEDIA_TEST_EXCLUDES:
|
validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
|
||||||
assert test_string not in text_content
|
|
||||||
for test_string in WIKIPEDIA_TEST_STRINGS:
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test Bing processing
|
# Test Bing processing
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
|
os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
|
||||||
)
|
)
|
||||||
text_content = result.text_content.replace("\\", "")
|
text_content = result.text_content.replace("\\", "")
|
||||||
for test_string in SERP_TEST_EXCLUDES:
|
validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
|
||||||
assert test_string not in text_content
|
|
||||||
for test_string in SERP_TEST_STRINGS:
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test RSS processing
|
# Test RSS processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
|
||||||
|
|
@ -239,9 +230,7 @@ def test_markitdown_local() -> None:
|
||||||
|
|
||||||
## Test non-UTF-8 encoding
|
## Test non-UTF-8 encoding
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
|
||||||
text_content = result.text_content.replace("\\", "")
|
validate_strings(result, CSV_CP932_TEST_STRINGS)
|
||||||
for test_string in CSV_CP932_TEST_STRINGS:
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue