Added .docx to optional dependencies

This commit is contained in:
Adam Fourney 2025-02-28 17:06:59 -08:00
parent b9487b6b6d
commit 98698a64ce
3 changed files with 31 additions and 5 deletions

View file

@ -26,7 +26,6 @@ classifiers = [
dependencies = [ dependencies = [
"beautifulsoup4", "beautifulsoup4",
"requests", "requests",
"mammoth",
"markdownify~=0.14.1", "markdownify~=0.14.1",
"numpy", "numpy",
"pandas", "pandas",
@ -46,8 +45,12 @@ dependencies = [
] ]
[project.optional-dependencies] [project.optional-dependencies]
all = ["python-pptx"] all = [
"python-pptx",
"mammoth"
]
pptx = ["python-pptx"] pptx = ["python-pptx"]
docx = ["mammoth"]
[project.urls] [project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme" Documentation = "https://github.com/microsoft/markitdown#readme"

View file

@ -1,6 +1,6 @@
from typing import Union import sys
import mammoth from typing import Union
from ._base import ( from ._base import (
DocumentConverterResult, DocumentConverterResult,
@ -8,6 +8,16 @@ from ._base import (
from ._base import DocumentConverter from ._base import DocumentConverter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException
# Try loading dependencies that are optional for the package but required by this converter.
# Any import failure is captured here and reported later, when a .docx conversion is attempted.
_dependency_exc_info = None
try:
import mammoth
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class DocxConverter(HtmlConverter): class DocxConverter(HtmlConverter):
@ -26,6 +36,19 @@ class DocxConverter(HtmlConverter):
if extension.lower() != ".docx": if extension.lower() != ".docx":
return None return None
# Fail with an actionable error if the optional dependencies could not be imported
if _dependency_exc_info is not None:
raise MissingDependencyException(
f"""{type(self).__name__} recognized the input as a potential .docx file, but the dependencies needed to read .docx files have not been installed. To resolve this error, include the optional dependency [docx] or [all] when installing MarkItDown. For example:
* pip install markitdown[docx]
* pip install markitdown[all]
* pip install markitdown[pptx, docx, ...]
* etc."""
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
result = None result = None
with open(local_path, "rb") as docx_file: with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None) style_map = kwargs.get("style_map", None)

View file

@ -67,7 +67,7 @@ class PptxConverter(HtmlConverter):
# Load the dependencies # Load the dependencies
if _dependency_exc_info is not None: if _dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
f"""{type(self).__name__} recognized the input as a .pptx file, but the dependencies needed to read .pptx files have not been installed. To resolve this error, include the optional dependency [pptx] or [all] when installing MarkItDown. For example: f"""{type(self).__name__} recognized the input as a potential .pptx file, but the dependencies needed to read .pptx files have not been installed. To resolve this error, include the optional dependency [pptx] or [all] when installing MarkItDown. For example:
* pip install markitdown[pptx] * pip install markitdown[pptx]
* pip install markitdown[all] * pip install markitdown[all]