diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..f12fbcb --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,29 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile +{ + "name": "Existing Dockerfile", + "build": { + // Sets the run context to one level up instead of the .devcontainer folder. + "context": "..", + // Update the 'dockerfile' property if you aren't using the standard 'Dockerfile' filename. + "dockerfile": "../Dockerfile" + }, + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + "features": { + "ghcr.io/devcontainers-extra/features/hatch:2": {} + }, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Uncomment the next line to run commands after the container is created. + // "postCreateCommand": "cat /etc/os-release", + + // Configure tool-specific properties. + // "customizations": {}, + + // Connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. 
+ "remoteUser": "root" +} diff --git a/Dockerfile b/Dockerfile index 492ad8a..f9c0bef 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,11 @@ -FROM python:3.13-alpine +FROM python:3.13-slim-bullseye USER root # Runtime dependency -RUN apk add --no-cache ffmpeg +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* RUN pip install markitdown diff --git a/README.md b/README.md index 7079dbf..75c2ba0 100644 --- a/README.md +++ b/README.md @@ -2,65 +2,47 @@ [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) -The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.) +MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc.). +It supports: +- PDF +- PowerPoint +- Word +- Excel +- Images (EXIF metadata and OCR) +- Audio (EXIF metadata and speech transcription) +- HTML +- Text-based formats (CSV, JSON, XML) +- ZIP files (iterates over contents) -It presently supports: +To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .` -- PDF (.pdf) -- PowerPoint (.pptx) -- Word (.docx) -- Excel (.xlsx) -- Images (EXIF metadata, and OCR) -- Audio (EXIF metadata, and speech transcription) -- HTML (special handling of Wikipedia, etc.) -- Various other text-based formats (csv, json, xml, etc.) -- ZIP (Iterates over contents and converts each file) +## Usage -# Installation - -You can install `markitdown` using pip: - -```python -pip install markitdown -``` - -or from the source - -```sh -pip install -e . 
-``` - -# Usage -The API is simple: - -```python -from markitdown import MarkItDown - -markitdown = MarkItDown() -result = markitdown.convert("test.xlsx") -print(result.text_content) -``` - -To use this as a command-line utility, install it and then run it like this: - -```bash -markitdown path-to-file.pdf -``` - -This will output Markdown to standard output. You can save it like this: +### Command-Line ```bash markitdown path-to-file.pdf > document.md ``` -You can pipe content to standard input by omitting the argument: +You can also pipe content: ```bash cat path-to-file.pdf | markitdown ``` -You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `llm_client` and `llm_model` parameters to MarkItDown object, according to your specific client. +### Python API +Basic usage in Python: + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("test.xlsx") +print(result.text_content) +``` + +To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: ```python from markitdown import MarkItDown @@ -72,7 +54,7 @@ result = md.convert("example.jpg") print(result.text_content) ``` -You can also use the project as Docker Image: +### Docker ```sh docker build -t markitdown:latest . @@ -93,28 +75,27 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -### Running Tests +### Running Tests and Checks -To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install). 
+- Install `hatch` in your environment and run tests: + ```sh + pip install hatch # Other ways of installing hatch: https://hatch.pypa.io/dev/install/ + hatch shell + hatch test + ``` -```sh -pip install hatch -hatch shell -hatch test -``` + (Alternative) Use the Devcontainer, which has all the dependencies installed: + ```sh + # Reopen the project in Devcontainer and run: + hatch test + ``` -### Running Pre-commit Checks - -Please run the pre-commit checks before submitting a PR. - -```sh -pre-commit run --all-files -``` +- Run pre-commit checks before submitting a PR: `pre-commit run --all-files` ## Trademarks -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies.