Updated READMEs, and finished remaining feature-categories.

This commit is contained in:
Adam Fourney 2025-02-28 22:53:11 -08:00
parent a2cf8ee889
commit 53feead8ae
5 changed files with 67 additions and 23 deletions

View file

@ -5,7 +5,8 @@
[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen) [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
> [!IMPORTANT] > [!IMPORTANT]
> MarkItDown 0.0.2 alpha 1 (0.0.2a1) introduces a plugin-based architecture. As much as was possible, command-line and Python interfaces have remained the same as 0.0.1a3 to support backward compatibility. Please report any issues you encounter. Some interface changes may yet occur as we continue to refine MarkItDown to a first non-alpha release. > Breaking changes between 0.0.1 and 0.0.2:
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior.
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
It supports: It supports:
@ -22,12 +23,12 @@ It supports:
- Youtube URLs - Youtube URLs
- ... and more! - ... and more!
To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
```bash ```bash
git clone git@github.com:microsoft/markitdown.git git clone git@github.com:microsoft/markitdown.git
cd markitdown cd markitdown
pip install -e packages/markitdown pip install -e packages/markitdown[all]
``` ```
## Usage ## Usage
@ -50,6 +51,28 @@ You can also pipe content:
cat path-to-file.pdf | markitdown cat path-to-file.pdf | markitdown
``` ```
### Optional Dependencies
MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
```bash
pip install 'markitdown[pdf,docx,pptx]'
```
will install only the dependencies for PDF, DOCX, and PPTX files.
At the moment, the following optional dependencies are available:
* `[all]` Installs all optional dependencies
* `[pptx]` Installs dependencies for PowerPoint files
* `[docx]` Installs dependencies for Word files
* `[xlsx]` Installs dependencies for Excel files
* `[xls]` Installs dependencies for older Excel files
* `[pdf]` Installs dependencies for PDF files
* `[outlook]` Installs dependencies for Outlook messages
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
### Plugins ### Plugins
MarkItDown also supports 3rd-party plugins. Plugins are disabled by default. To list installed plugins: MarkItDown also supports 3rd-party plugins. Plugins are disabled by default. To list installed plugins:

View file

@ -10,7 +10,7 @@
From PyPI: From PyPI:
```bash ```bash
pip install markitdown pip install markitdown[all]
``` ```
From source: From source:
@ -18,7 +18,7 @@ From source:
```bash ```bash
git clone git@github.com:microsoft/markitdown.git git clone git@github.com:microsoft/markitdown.git
cd markitdown cd markitdown
pip install -e packages/markitdown pip install -e packages/markitdown[all]
``` ```
## Usage ## Usage

View file

@ -27,16 +27,9 @@ dependencies = [
"beautifulsoup4", "beautifulsoup4",
"requests", "requests",
"markdownify~=0.14.1", "markdownify~=0.14.1",
"numpy",
"puremagic", "puremagic",
"pydub",
"youtube-transcript-api",
"SpeechRecognition",
"pathvalidate", "pathvalidate",
"charset-normalizer", "charset-normalizer",
"openai",
"azure-ai-documentintelligence",
"azure-identity"
] ]
[project.optional-dependencies] [project.optional-dependencies]
@ -47,7 +40,13 @@ all = [
"openpyxl", "openpyxl",
"xlrd", "xlrd",
"pdfminer.six", "pdfminer.six",
"olefile" "olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api",
"openai",
"azure-ai-documentintelligence",
"azure-identity"
] ]
pptx = ["python-pptx"] pptx = ["python-pptx"]
docx = ["mammoth"] docx = ["mammoth"]
@ -55,6 +54,10 @@ xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"] xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"] pdf = ["pdfminer.six"]
outlook = ["olefile"] outlook = ["olefile"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
openai = ["openai"]
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
[project.urls] [project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme" Documentation = "https://github.com/microsoft/markitdown#readme"

View file

@ -1,16 +1,24 @@
from typing import Any, Union from typing import Any, Union
import re import re
import sys
# Azure imports from ._base import DocumentConverter, DocumentConverterResult
from azure.ai.documentintelligence import DocumentIntelligenceClient from .._exceptions import MissingDependencyException
from azure.ai.documentintelligence.models import (
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest, AnalyzeDocumentRequest,
AnalyzeResult, AnalyzeResult,
DocumentAnalysisFeature, DocumentAnalysisFeature,
) )
from azure.identity import DefaultAzureCredential from azure.identity import DefaultAzureCredential
except ImportError:
from ._base import DocumentConverter, DocumentConverterResult # Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
@ -30,6 +38,16 @@ class DocumentIntelligenceConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
# Raise an error if the dependencies are not available.
# This is different than other converters since this one isn't even instantiated
# unless explicitly requested.
if _dependency_exc_info is not None:
raise MissingDependencyException(
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
self.endpoint = endpoint self.endpoint = endpoint
self.api_version = api_version self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient( self.doc_intel_client = DocumentIntelligenceClient(

View file

@ -7,7 +7,7 @@ import mimetypes
class ImageConverter(MediaConverter): class ImageConverter(MediaConverter):
""" """
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
""" """
def __init__( def __init__(